bitkeeper revision 1.1041.2.3 (40e33e15ASwSHg8_daLqUfOJezgo6A)
author djm@kirby.fc.hp.com <djm@kirby.fc.hp.com>
Wed, 30 Jun 2004 22:26:29 +0000 (22:26 +0000)
committer djm@kirby.fc.hp.com <djm@kirby.fc.hp.com>
Wed, 30 Jun 2004 22:26:29 +0000 (22:26 +0000)
Encapsulate shadow_mode for minimal perturbation portability
and move shadow.[ch] to machdep layer

.rootkeys
BitKeeper/etc/logging_ok
xen/arch/x86/shadow.c [new file with mode: 0644]
xen/common/shadow.c [deleted file]
xen/include/asm-x86/shadow.h [new file with mode: 0644]
xen/include/xen/shadow.h

index feb8ec1995065d41089b6f71b9ada76491a03ac9..26357f061774fed2e83d2cfd3457bb1ddec05b41 100644 (file)
--- a/.rootkeys
+++ b/.rootkeys
 3ddb79bc1_2bAt67x9MFCP4AZrQnvQ xen/arch/x86/process.c
 3ddb79bc7KxGCEJsgBnkDX7XjD_ZEQ xen/arch/x86/rwlock.c
 3ddb79bcrD6Z_rUvSDgrvjyb4846Eg xen/arch/x86/setup.c
+405b8599xI_PoEr3zZoJ2on-jdn7iw xen/arch/x86/shadow.c
 3ddb79bcSx2e8JSR3pdSGa8x1ScYzA xen/arch/x86/smp.c
 3ddb79bcfUN3-UBCPzX26IU8bq-3aw xen/arch/x86/smpboot.c
 3ddb79bc-Udq7ol-NX4q9XsYnN7A2Q xen/arch/x86/time.c
 40589968dD2D1aejwSOvrROg7fOvGQ xen/common/sched_bvt.c
 40589968be_t_n0-w6ggceW7h-sx0w xen/common/sched_rrobin.c
 3e397e6619PgAfBbw2XFbXkewvUWgw xen/common/schedule.c
-405b8599xI_PoEr3zZoJ2on-jdn7iw xen/common/shadow.c
 3ddb79bdB9RNMnkQnUyZ5C9hhMSQQw xen/common/slab.c
 3ddb79bd0gVQYmL2zvuJnldvD0AGxQ xen/common/softirq.c
 3e7f358awXBC3Vw-wFRwPw18qL1khg xen/common/string.c
 3ddb79c2QF5-pZGzuX4QukPCDAl59A xen/include/asm-x86/processor.h
 40cf1596bim9F9DNdV75klgRSZ6Y2A xen/include/asm-x86/ptrace.h
 3ddb79c2plf7ciNgoNjU-RsbUzawsw xen/include/asm-x86/rwlock.h
+405b8599BsDsDwKEJLS0XipaiQW3TA xen/include/asm-x86/shadow.h
 3ddb79c3Hgbb2g8CyWLMCK-6_ZVQSQ xen/include/asm-x86/smp.h
 3ddb79c3jn8ALV_S9W5aeTYUQRKBpg xen/include/asm-x86/smpboot.h
 3ddb79c3NiyQE2vQnyGiaBnNjBO1rA xen/include/asm-x86/spinlock.h
 40589969nPq3DMzv24RDb5LXE9brHw xen/include/xen/sched-if.h
 3ddb79c0LzqqS0LhAQ50ekgj4oGl7Q xen/include/xen/sched.h
 403a06a7H0hpHcKpAiDe5BPnaXWTlA xen/include/xen/serial.h
-405b8599BsDsDwKEJLS0XipaiQW3TA xen/include/xen/shadow.h
+40e3392dib7GrcBAu5cT-EUZTYzeEQ xen/include/xen/shadow.h
 3ddb79c14dXIhP7C2ahnoD08K90G_w xen/include/xen/slab.h
 3ddb79c09xbS-xxfKxuV3JETIhBzmg xen/include/xen/smp.h
 3ddb79c1Vi5VleJAOKHAlY0G2zAsgw xen/include/xen/softirq.h
index 04eb20bb841193f07cf647bdb0fdcaf8f6ca6378..d2431905b51c3a09f858961a14b5b130c1c38854 100644 (file)
@@ -9,6 +9,7 @@ bd240@labyrinth.cl.cam.ac.uk
 br260@br260.wolfson.cam.ac.uk
 br260@labyrinth.cl.cam.ac.uk
 br260@laudney.cl.cam.ac.uk
+djm@kirby.fc.hp.com
 gm281@boulderdash.cl.cam.ac.uk
 iap10@freefall.cl.cam.ac.uk
 iap10@labyrinth.cl.cam.ac.uk
diff --git a/xen/arch/x86/shadow.c b/xen/arch/x86/shadow.c
new file mode 100644 (file)
index 0000000..dc08bd0
--- /dev/null
@@ -0,0 +1,1058 @@
+/* -*-  Mode:C++; c-file-style:BSD; c-basic-offset:4; tab-width:4 -*- */
+
+#include <xen/config.h>
+#include <xen/types.h>
+#include <xen/mm.h>
+#include <xen/shadow.h>
+#include <asm/domain_page.h>
+#include <asm/page.h>
+#include <xen/event.h>
+#include <xen/trace.h>
+
+
+/********
+
+To use these shadow page tables, guests must not rely on the ACCESSED
+and DIRTY bits on L2 pte's being accurate -- they will typically all be set.
+
+I doubt this will break anything. (If guests want to use the va_update
+mechanism they've signed up for this anyhow...)
+
+There's a per-domain shadow table spin lock which works fine for SMP
+hosts. We don't have to worry about interrupts as no shadow operations
+happen in an interrupt context. It's probably not quite ready for SMP
+guest operation as we have to worry about synchronisation between gpte
+and spte updates. It's possible that this might only happen in a
+hypercall context, in which case we'll probably have a per-domain
+hypercall lock anyhow (at least initially).
+
+********/
+
+
+/**
+
+FIXME:
+
+The shadow table flush command is dangerous on SMP systems as the
+guest may be using the L2 on one CPU while the other is trying to 
+blow the table away. 
+
+The current save restore code works around this by not calling FLUSH,
+but by calling CLEAN2 which leaves all L2s intact (this is probably
+quicker anyhow).
+
+Even so, we have to be very careful. The flush code may need to cause
+a TLB flush on another CPU. It needs to do this while holding the
+shadow table lock. The trouble is, the guest may be in the shadow page
+fault handler spinning waiting to grab the shadow lock. It may have
+interrupts disabled, hence we can't use the normal flush_tlb_cpu
+mechanism.
+
+For the moment, we have a grim race whereby the spinlock in the shadow
+fault handler is actually a try lock, in a loop with a helper for the
+tlb flush code.
+
+A better solution would be to take a new flush lock, then raise a
+per-domain soft irq on the other CPU.  The softirq will switch to
+init's PTs, then do an atomic inc of a variable to count himself in,
+then spin on a lock.  Having noticed that the other guy has counted
+in, flush the shadow table, then release him by dropping the lock. He
+will then reload cr3 from mm.page_table on the way out of the softirq.
+
+In domian-softirq context we know that the guy holds no locks and has
+interrupts enabled. Nothing can go wrong ;-)
+
+**/
+
+static inline void free_shadow_page( struct mm_struct *m, 
+                                     struct pfn_info *pfn_info )
+{
+    unsigned long flags;
+    unsigned long type = pfn_info->type_and_flags & PGT_type_mask;
+
+    m->shadow_page_count--;
+
+    if (type == PGT_l1_page_table)
+        perfc_decr(shadow_l1_pages);
+    else if (type == PGT_l2_page_table)
+        perfc_decr(shadow_l2_pages);
+    else printk("Free shadow weird page type pfn=%08x type=%08x\n",
+                frame_table-pfn_info, pfn_info->type_and_flags);
+    
+    pfn_info->type_and_flags = 0;
+
+    spin_lock_irqsave(&free_list_lock, flags);
+    list_add(&pfn_info->list, &free_list);
+    free_pfns++;
+    spin_unlock_irqrestore(&free_list_lock, flags);
+}
+
+static void __free_shadow_table( struct mm_struct *m )
+{
+    int j, free=0;
+    struct shadow_status *a,*next;
+    // the code assumes you're not using the page tables i.e.
+    // the domain is stopped and cr3 is something else!!
+
+    // walk the hash table and call free_shadow_page on all pages
+
+    shadow_audit(m,1);
+
+    for(j=0;j<shadow_ht_buckets;j++)
+    {
+        a = &m->shadow_ht[j];        
+        if (a->pfn)
+        {
+            free_shadow_page( m, 
+                              &frame_table[a->spfn_and_flags & PSH_pfn_mask] );
+            a->pfn = 0;
+            a->spfn_and_flags = 0;
+            free++;
+        }
+        next=a->next;
+        a->next=NULL;
+        a=next;
+        while(a)
+        { 
+            struct shadow_status *next = a->next;
+
+            free_shadow_page( m, 
+                              &frame_table[a->spfn_and_flags & PSH_pfn_mask] );
+            a->pfn = 0;
+            a->spfn_and_flags = 0;
+            free++;
+            a->next = m->shadow_ht_free;           
+            m->shadow_ht_free = a;
+            a=next;
+        }
+        shadow_audit(m,0);
+    }
+    SH_LOG("Free shadow table. Freed= %d",free);
+}
+
+
+#define TABLE_OP_ZERO_L2 1
+#define TABLE_OP_ZERO_L1 2
+#define TABLE_OP_FREE_L1 3
+
+static inline int shadow_page_op( struct mm_struct *m, unsigned int op, 
+                                                                 unsigned int gpfn,
+                                  struct pfn_info *spfn_info, int *work )
+{
+    unsigned int spfn = spfn_info-frame_table;
+       int restart = 0;
+
+    switch( op )
+    {
+       case TABLE_OP_ZERO_L2:
+       {
+               if ( (spfn_info->type_and_flags & PGT_type_mask) == 
+             PGT_l2_page_table )
+               {
+                       unsigned long * spl1e = map_domain_mem( spfn<<PAGE_SHIFT );
+#ifdef __i386__
+                       memset(spl1e, 0, DOMAIN_ENTRIES_PER_L2_PAGETABLE * sizeof(*spl1e));
+#endif
+                       unmap_domain_mem( spl1e );
+               }
+    }
+       break;
+       
+       case TABLE_OP_ZERO_L1:
+       {
+               if ( (spfn_info->type_and_flags & PGT_type_mask) == 
+             PGT_l1_page_table )
+               {
+                       unsigned long * spl1e = map_domain_mem( spfn<<PAGE_SHIFT );
+                       memset( spl1e, 0, ENTRIES_PER_L1_PAGETABLE * sizeof(*spl1e) );
+                       unmap_domain_mem( spl1e );
+               }
+    }
+       break;
+
+       case TABLE_OP_FREE_L1:
+       {
+               if ( (spfn_info->type_and_flags & PGT_type_mask) == 
+             PGT_l1_page_table )
+               {
+                       // lock is already held
+                       delete_shadow_status( m, gpfn );
+                       restart = 1; // we need to go to start of list again
+               }
+    }
+
+       break;
+       
+       default:
+               BUG();
+
+    }
+    return restart;
+}
+
+static void __scan_shadow_table( struct mm_struct *m, unsigned int op )
+{
+    int j, work=0;
+    struct shadow_status *a, *next;
+    // the code assumes you're not using the page tables i.e.
+    // the domain is stopped and cr3 is something else!!
+
+    // walk the hash table and call free_shadow_page on all pages
+
+    shadow_audit(m,1);
+
+    for(j=0;j<shadow_ht_buckets;j++)
+    {
+       retry:
+        a = &m->shadow_ht[j];     
+               next = a->next;
+        if (a->pfn)
+        {
+            if ( shadow_page_op( m, op, a->pfn,                                                                 
+                                                                &frame_table[a->spfn_and_flags & PSH_pfn_mask], 
+                                                                &work ) )
+                               goto retry;
+        }
+        a=next;
+        while(a)
+        { 
+                       next = a->next;
+            if ( shadow_page_op( m, op, a->pfn,
+                                                                &frame_table[a->spfn_and_flags & PSH_pfn_mask],
+                                                                &work ) )
+                               goto retry;
+            a=next;
+        }
+        shadow_audit(m,0);
+    }
+    SH_VLOG("Scan shadow table. Work=%d l1=%d l2=%d", work, perfc_value(shadow_l1_pages), perfc_value(shadow_l2_pages));
+}
+
+
+void shadow_mode_init(void)
+{
+}
+
+int shadow_mode_enable( struct domain *p, unsigned int mode )
+{
+    struct mm_struct *m = &p->mm;
+    struct shadow_status **fptr;
+    int i;
+
+    m->shadow_mode = mode;
+    // allocate hashtable
+    m->shadow_ht = kmalloc(shadow_ht_buckets * 
+                           sizeof(struct shadow_status));
+    if( m->shadow_ht == NULL )
+        goto nomem;
+
+    memset(m->shadow_ht, 0, shadow_ht_buckets * sizeof(struct shadow_status));
+
+    // allocate space for first lot of extra nodes
+    m->shadow_ht_extras = kmalloc(sizeof(void*) + 
+                                  (shadow_ht_extra_size * 
+                                   sizeof(struct shadow_status)));
+    if( m->shadow_ht_extras == NULL )
+        goto nomem;
+
+    memset( m->shadow_ht_extras, 0, sizeof(void*) + (shadow_ht_extra_size * 
+                                                     sizeof(struct shadow_status)) );
+
+    m->shadow_extras_count++;
+    // add extras to free list
+    fptr = &m->shadow_ht_free;
+    for ( i=0; i<shadow_ht_extra_size; i++ )
+    {
+        *fptr = &m->shadow_ht_extras[i];
+        fptr = &(m->shadow_ht_extras[i].next);
+    }
+    *fptr = NULL;
+    *((struct shadow_status ** ) 
+      &m->shadow_ht_extras[shadow_ht_extra_size]) = NULL;
+
+    if ( mode == SHM_logdirty )
+    {
+        m->shadow_dirty_bitmap_size = (p->max_pages+63)&(~63);
+        m->shadow_dirty_bitmap = 
+            kmalloc( m->shadow_dirty_bitmap_size/8);
+        if( m->shadow_dirty_bitmap == NULL )
+        {
+            m->shadow_dirty_bitmap_size = 0;
+            goto nomem;
+        }
+        memset(m->shadow_dirty_bitmap,0,m->shadow_dirty_bitmap_size/8);
+    }
+
+    // call shadow_mk_pagetable
+    __shadow_mk_pagetable( m );
+    return 0;
+
+nomem:
+    return -ENOMEM;
+}
+
+void shadow_mode_disable( struct domain *p )
+{
+    struct mm_struct *m = &p->mm;
+    struct shadow_status *next;
+
+    __free_shadow_table( m );
+    m->shadow_mode = 0;
+
+    SH_LOG("freed tables count=%d l1=%d l2=%d",
+           m->shadow_page_count, perfc_value(shadow_l1_pages), perfc_value(shadow_l2_pages));
+
+    next = m->shadow_ht_extras;
+    while( next )
+    {
+        struct shadow_status * this = next;
+        m->shadow_extras_count--;
+        next = *((struct shadow_status **)(&next[shadow_ht_extra_size]));
+        kfree( this );
+    }
+
+    SH_LOG("freed extras, now %d", m->shadow_extras_count);
+
+    if( m->shadow_dirty_bitmap  )
+    {
+        kfree( m->shadow_dirty_bitmap );
+        m->shadow_dirty_bitmap = 0;
+        m->shadow_dirty_bitmap_size = 0;
+    }
+
+    // free the hashtable itself
+    kfree( &m->shadow_ht[0] );
+}
+
+static int shadow_mode_table_op(struct domain *d, 
+                                                           dom0_shadow_control_t *sc)
+{
+    unsigned int op = sc->op;
+    struct mm_struct *m = &d->mm;
+    int rc = 0;
+
+    // since Dom0 did the hypercall, we should be running with it's page
+    // tables right now. Calling flush on yourself would be really
+    // stupid.
+
+    ASSERT(spin_is_locked(&d->mm.shadow_lock));
+
+    if ( m == &current->mm )
+    {
+        printk("Don't try and flush your own page tables!\n");
+        return -EINVAL;
+    }
+   
+    SH_VLOG("shadow mode table op %08lx %08lx count %d",pagetable_val( m->pagetable),pagetable_val(m->shadow_table), m->shadow_page_count);
+
+    shadow_audit(m,1);
+
+    switch(op)
+    {
+    case DOM0_SHADOW_CONTROL_OP_FLUSH:
+        // XXX THIS IS VERY DANGEROUS : MUST ENSURE THE PTs ARE NOT IN USE ON
+               // OTHER CPU -- fix when we get sched sync pause.
+        __free_shadow_table( m );  
+        break;
+   
+    case DOM0_SHADOW_CONTROL_OP_CLEAN:   // zero all-non hypervisor
+       {
+               __scan_shadow_table( m, TABLE_OP_ZERO_L2 );
+               __scan_shadow_table( m, TABLE_OP_ZERO_L1 );
+
+               goto send_bitmap;
+       }
+               
+
+    case DOM0_SHADOW_CONTROL_OP_CLEAN2:  // zero all L2, free L1s
+    {
+               int i,j,zero=1;
+               
+               __scan_shadow_table( m, TABLE_OP_ZERO_L2 );
+               __scan_shadow_table( m, TABLE_OP_FREE_L1 );
+               
+       send_bitmap:
+               sc->stats.fault_count       = d->mm.shadow_fault_count;
+               sc->stats.dirty_count       = d->mm.shadow_dirty_count;
+               sc->stats.dirty_net_count   = d->mm.shadow_dirty_net_count;
+               sc->stats.dirty_block_count = d->mm.shadow_dirty_block_count;
+
+               d->mm.shadow_fault_count       = 0;
+               d->mm.shadow_dirty_count       = 0;
+               d->mm.shadow_dirty_net_count   = 0;
+               d->mm.shadow_dirty_block_count = 0;
+       
+               sc->pages = d->tot_pages;
+
+               if( d->tot_pages > sc->pages || 
+                       !sc->dirty_bitmap || !d->mm.shadow_dirty_bitmap )
+               {
+                       rc = -EINVAL;
+                       goto out;
+               }
+
+       
+#define chunk (8*1024) // do this in 1KB chunks for L1 cache
+       
+               for(i=0;i<d->tot_pages;i+=chunk)
+               {
+                       int bytes = ((  ((d->tot_pages-i) > (chunk))?
+                                                       (chunk):(d->tot_pages-i) ) + 7) / 8;
+           
+                       copy_to_user( sc->dirty_bitmap + (i/(8*sizeof(unsigned long))),
+                                                 d->mm.shadow_dirty_bitmap +(i/(8*sizeof(unsigned long))),
+                                                 bytes );
+           
+                       for(j=0; zero && j<bytes/sizeof(unsigned long);j++)
+                       {
+                               if( d->mm.shadow_dirty_bitmap[j] != 0 )
+                                       zero = 0;
+                       }
+
+                       memset( d->mm.shadow_dirty_bitmap +(i/(8*sizeof(unsigned long))),
+                                       0, bytes);
+               }
+
+        /* Might as well stop the domain as an optimization. */
+               if ( zero )
+            domain_pause_by_systemcontroller(d);
+
+               break;
+    }
+
+    case DOM0_SHADOW_CONTROL_OP_PEEK:
+    {
+               int i;
+
+               sc->stats.fault_count       = d->mm.shadow_fault_count;
+               sc->stats.dirty_count       = d->mm.shadow_dirty_count;
+               sc->stats.dirty_net_count   = d->mm.shadow_dirty_net_count;
+               sc->stats.dirty_block_count = d->mm.shadow_dirty_block_count;
+       
+               if( d->tot_pages > sc->pages || 
+                       !sc->dirty_bitmap || !d->mm.shadow_dirty_bitmap )
+               {
+                       rc = -EINVAL;
+                       goto out;
+               }
+       
+               sc->pages = d->tot_pages;
+       
+#define chunk (8*1024) // do this in 1KB chunks for L1 cache
+       
+               for(i=0;i<d->tot_pages;i+=chunk)
+               {
+                       int bytes = ((  ((d->tot_pages-i) > (chunk))?
+                                                       (chunk):(d->tot_pages-i) ) + 7) / 8;
+           
+                       copy_to_user( sc->dirty_bitmap + (i/(8*sizeof(unsigned long))),
+                                                 d->mm.shadow_dirty_bitmap +(i/(8*sizeof(unsigned long))),
+                                                 bytes );          
+               }
+
+               break;
+    }
+
+       default:
+               BUG();
+
+    }
+
+
+out:
+
+    SH_VLOG("shadow mode table op : page count %d", m->shadow_page_count);
+
+    shadow_audit(m,1);
+
+    // call shadow_mk_pagetable
+    __shadow_mk_pagetable( m );
+
+    return rc;
+}
+
+int shadow_mode_control( struct domain *p, dom0_shadow_control_t *sc )
+{
+    unsigned int cmd = sc->op;
+    int rc = 0;
+
+    spin_lock(&p->mm.shadow_lock);
+
+    if ( p->mm.shadow_mode && cmd == DOM0_SHADOW_CONTROL_OP_OFF )
+    {
+        shadow_mode_disable(p);
+    }
+    else if ( cmd == DOM0_SHADOW_CONTROL_OP_ENABLE_TEST )
+    {
+        if(p->mm.shadow_mode) shadow_mode_disable(p);
+        shadow_mode_enable(p, SHM_test);
+    } 
+    else if ( cmd == DOM0_SHADOW_CONTROL_OP_ENABLE_LOGDIRTY )
+    {
+        if(p->mm.shadow_mode) shadow_mode_disable(p);
+        shadow_mode_enable(p, SHM_logdirty);
+    } 
+    else if ( p->mm.shadow_mode && cmd >= DOM0_SHADOW_CONTROL_OP_FLUSH && cmd<=DOM0_SHADOW_CONTROL_OP_CLEAN2 )
+    {
+        rc = shadow_mode_table_op(p, sc);
+    }
+    else
+    {
+        rc = -EINVAL;
+    }
+
+       flush_tlb_cpu(p->processor);
+   
+    spin_unlock(&p->mm.shadow_lock);
+
+    return rc;
+}
+
+
+
+static inline struct pfn_info *alloc_shadow_page( struct mm_struct *m )
+{
+    m->shadow_page_count++;
+
+    return alloc_domain_page( NULL );
+}
+
+
+void unshadow_table( unsigned long gpfn, unsigned int type )
+{
+    unsigned long spfn;
+
+    SH_VLOG("unshadow_table type=%08x gpfn=%08lx",
+            type,
+            gpfn );
+
+    perfc_incrc(unshadow_table_count);
+
+    // this function is the same for both l1 and l2 tables
+
+    // even in the SMP guest case, there won't be a race here as
+    // this CPU was the one that cmpxchg'ed the page to invalid
+
+    spfn = __shadow_status(&current->mm, gpfn) & PSH_pfn_mask;
+
+    delete_shadow_status(&current->mm, gpfn);
+
+    free_shadow_page( &current->mm, &frame_table[spfn] );
+
+}
+
+
+unsigned long shadow_l2_table( 
+    struct mm_struct *m, unsigned long gpfn )
+{
+    struct pfn_info *spfn_info;
+    unsigned long spfn;
+    l2_pgentry_t *spl2e, *gpl2e;
+    int i;
+
+    SH_VVLOG("shadow_l2_table( %08lx )",gpfn);
+
+    perfc_incrc(shadow_l2_table_count);
+
+    // XXX in future, worry about racing in SMP guests 
+    //      -- use cmpxchg with PSH_pending flag to show progress (and spin)
+
+    spfn_info = alloc_shadow_page(m);
+
+    ASSERT( spfn_info ); // XXX deal with failure later e.g. blow cache
+
+    spfn_info->type_and_flags = PGT_l2_page_table;
+    perfc_incr(shadow_l2_pages);
+
+    spfn = (unsigned long) (spfn_info - frame_table);
+
+    // mark pfn as being shadowed, update field to point at shadow
+    set_shadow_status(m, gpfn, spfn | PSH_shadowed);
+    // we need to do this before the linear map is set up
+    spl2e = (l2_pgentry_t *) map_domain_mem(spfn << PAGE_SHIFT);
+
+#ifdef __i386__
+    // get hypervisor and 2x linear PT mapings installed 
+    memcpy(&spl2e[DOMAIN_ENTRIES_PER_L2_PAGETABLE], 
+           &idle_pg_table[DOMAIN_ENTRIES_PER_L2_PAGETABLE],
+           HYPERVISOR_ENTRIES_PER_L2_PAGETABLE * sizeof(l2_pgentry_t));
+    spl2e[LINEAR_PT_VIRT_START >> L2_PAGETABLE_SHIFT] =
+        mk_l2_pgentry((gpfn << PAGE_SHIFT) | __PAGE_HYPERVISOR);
+    spl2e[SH_LINEAR_PT_VIRT_START >> L2_PAGETABLE_SHIFT] =
+        mk_l2_pgentry((spfn << PAGE_SHIFT) | __PAGE_HYPERVISOR);
+    spl2e[PERDOMAIN_VIRT_START >> L2_PAGETABLE_SHIFT] =
+        mk_l2_pgentry(__pa(frame_table[gpfn].u.domain->mm.perdomain_pt) | 
+                      __PAGE_HYPERVISOR);
+#endif
+
+    // can't use the linear map as we may not be in the right PT
+    gpl2e = (l2_pgentry_t *) map_domain_mem(gpfn << PAGE_SHIFT);
+
+    // proactively create entries for pages that are already shadowed
+    for ( i = 0; i < DOMAIN_ENTRIES_PER_L2_PAGETABLE; i++ )
+    {
+        unsigned long spte = 0;
+
+#if 0  // Turns out this doesn't really help
+        unsigned long gpte;
+
+        gpte = l2_pgentry_val(gpl2e[i]);
+
+        if (gpte & _PAGE_PRESENT)
+        {
+            unsigned long s_sh = 
+                __shadow_status(p, gpte>>PAGE_SHIFT);
+
+            l2pde_general( m, &gpte, &spte, s_sh );
+
+        }
+#endif
+
+        spl2e[i] = mk_l2_pgentry( spte );
+
+    }
+
+    // its arguable we should 'preemptively shadow' a few active L1 pages
+    // to avoid taking a string of faults when 'jacking' a running domain
+
+    unmap_domain_mem( gpl2e );
+    unmap_domain_mem( spl2e );
+
+    SH_VLOG("shadow_l2_table( %08lx -> %08lx)",gpfn,spfn);
+
+    return spfn;
+}
+
+
+int shadow_fault( unsigned long va, long error_code )
+{
+    unsigned long gpte, spte;
+    struct mm_struct *m = &current->mm;
+
+    SH_VVLOG("shadow_fault( va=%08lx, code=%ld )", va, error_code );
+
+    check_pagetable( current, current->mm.pagetable, "pre-sf" );
+
+    if ( unlikely(__get_user(gpte, (unsigned long*)&linear_pg_table[va>>PAGE_SHIFT])) )
+    {
+        SH_VVLOG("shadow_fault - EXIT: read gpte faulted" );
+        return 0;  // propagate to guest
+    }
+
+    if ( ! (gpte & _PAGE_PRESENT) )
+    {
+        SH_VVLOG("shadow_fault - EXIT: gpte not present (%lx)",gpte );
+        return 0;  // we're not going to be able to help
+    }
+
+    if ( (error_code & 2)  && ! (gpte & _PAGE_RW) )
+    {
+        // write fault on RO page
+        return 0;
+    }
+
+    // take the lock and reread gpte
+
+    while( unlikely(!spin_trylock(&current->mm.shadow_lock)) )
+       {
+               extern volatile unsigned long flush_cpumask;
+               if ( test_and_clear_bit(smp_processor_id(), &flush_cpumask) )
+                       local_flush_tlb();
+               rep_nop();
+       }
+       
+       ASSERT(spin_is_locked(&current->mm.shadow_lock));
+       
+    if ( unlikely(__get_user(gpte, (unsigned long*)&linear_pg_table[va>>PAGE_SHIFT])) )
+    {
+        SH_VVLOG("shadow_fault - EXIT: read gpte faulted" );
+        spin_unlock(&m->shadow_lock);
+        return 0;  // propagate to guest
+    }
+
+    if ( unlikely(!(gpte & _PAGE_PRESENT)) )
+    {
+        SH_VVLOG("shadow_fault - EXIT: gpte not present (%lx)",gpte );
+        spin_unlock(&m->shadow_lock);
+        return 0;  // we're not going to be able to help
+    }
+
+    if ( error_code & 2  )  
+    {  // write fault
+        if ( likely(gpte & _PAGE_RW) )
+        {
+            l1pte_write_fault( m, &gpte, &spte );
+        }
+        else
+        {   // write fault on RO page
+            SH_VVLOG("shadow_fault - EXIT: write fault on RO page (%lx)",gpte );
+            spin_unlock(&m->shadow_lock);
+            return 0; // propagate to guest
+            // not clear whether we should set accessed bit here...
+        }
+    }
+    else
+    {
+        l1pte_read_fault( m, &gpte, &spte );
+    }
+
+    SH_VVLOG("plan: gpte=%08lx  spte=%08lx", gpte, spte );
+
+    // write back updated gpte
+    // XXX watch out for read-only L2 entries! (not used in Linux)
+    if ( unlikely( __put_user( gpte, (unsigned long*)&linear_pg_table[va>>PAGE_SHIFT])) )
+        BUG();  // fixme!
+
+    if ( unlikely( __put_user( spte, (unsigned long*)&shadow_linear_pg_table[va>>PAGE_SHIFT])) )
+    { 
+        // failed:
+        //  the L1 may not be shadowed, or the L2 entry may be insufficient
+
+        unsigned long gpde, spde, gl1pfn, sl1pfn;
+
+        SH_VVLOG("3: not shadowed or l2 insufficient gpte=%08lx  spte=%08lx",gpte,spte );
+
+        gpde = l2_pgentry_val(linear_l2_table[va>>L2_PAGETABLE_SHIFT]);
+
+        gl1pfn = gpde>>PAGE_SHIFT;
+
+        
+        if ( ! (sl1pfn=__shadow_status(&current->mm, gl1pfn) ) )
+        {
+            // this L1 is NOT already shadowed so we need to shadow it
+            struct pfn_info *sl1pfn_info;
+            unsigned long *gpl1e, *spl1e;
+            int i;
+            sl1pfn_info = alloc_shadow_page( &current->mm ); 
+            sl1pfn_info->type_and_flags = PGT_l1_page_table;
+                       
+            sl1pfn = sl1pfn_info - frame_table;
+
+            SH_VVLOG("4a: l1 not shadowed ( %08lx )",sl1pfn);
+            perfc_incrc(shadow_l1_table_count);
+            perfc_incr(shadow_l1_pages);
+
+            set_shadow_status(&current->mm, gl1pfn, PSH_shadowed | sl1pfn);
+
+            l2pde_general( m, &gpde, &spde, sl1pfn );
+
+            linear_l2_table[va>>L2_PAGETABLE_SHIFT] = mk_l2_pgentry(gpde);
+            shadow_linear_l2_table[va>>L2_PAGETABLE_SHIFT] =  mk_l2_pgentry(spde);
+
+            gpl1e = (unsigned long *) &(linear_pg_table[
+                (va>>PAGE_SHIFT) & ~(ENTRIES_PER_L1_PAGETABLE-1) ]);
+
+            spl1e = (unsigned long *) &shadow_linear_pg_table[
+                (va>>PAGE_SHIFT) & ~(ENTRIES_PER_L1_PAGETABLE-1) ];
+
+
+            for ( i = 0; i < ENTRIES_PER_L1_PAGETABLE; i++ )
+            {
+                l1pte_no_fault( m, &gpl1e[i], &spl1e[i] );
+            }
+
+
+        }
+        else
+        {
+            // this L1 was shadowed (by another PT) but we didn't have an L2
+            // entry for it
+
+            SH_VVLOG("4b: was shadowed, l2 missing ( %08lx )",sl1pfn);
+
+            l2pde_general( m, &gpde, &spde, sl1pfn );
+
+            linear_l2_table[va>>L2_PAGETABLE_SHIFT] = mk_l2_pgentry(gpde);
+            shadow_linear_l2_table[va>>L2_PAGETABLE_SHIFT] = mk_l2_pgentry(spde);
+   
+        }              
+
+        shadow_linear_pg_table[va>>PAGE_SHIFT] = mk_l1_pgentry(spte);
+        // (we need to do the above even if we've just made the shadow L1)
+
+    } // end of fixup writing the shadow L1 directly failed
+     
+    perfc_incrc(shadow_fixup_count);
+
+       m->shadow_fault_count++;
+
+    check_pagetable( current, current->mm.pagetable, "post-sf" );
+
+    spin_unlock(&m->shadow_lock);
+
+    return 1; // let's try the faulting instruction again...
+
+}
+
+
+void shadow_l1_normal_pt_update( unsigned long pa, unsigned long gpte,
+                                 unsigned long *prev_spfn_ptr,
+                                 l1_pgentry_t **prev_spl1e_ptr )
+{
+    unsigned long gpfn, spfn, spte, prev_spfn = *prev_spfn_ptr;    
+    l1_pgentry_t * spl1e, * prev_spl1e = *prev_spl1e_ptr;
+
+
+    SH_VVLOG("shadow_l1_normal_pt_update pa=%08lx, gpte=%08lx, prev_spfn=%08lx, prev_spl1e=%p\n",
+             pa,gpte,prev_spfn, prev_spl1e);
+
+    // to get here, we know the l1 page *must* be shadowed
+
+    gpfn = pa >> PAGE_SHIFT;
+    spfn = __shadow_status(&current->mm, gpfn) & PSH_pfn_mask;
+
+    if ( spfn == prev_spfn )
+    {
+        spl1e = prev_spl1e;
+    }
+    else
+    {
+        if( prev_spl1e ) unmap_domain_mem( prev_spl1e );
+        spl1e = (l1_pgentry_t *) map_domain_mem( spfn << PAGE_SHIFT );
+        *prev_spfn_ptr  = spfn;
+        *prev_spl1e_ptr = spl1e;
+    }
+
+    // XXX we assume only pagetables can be shadowed; 
+    // this will have to change to allow arbitrary CoW etc.
+
+    l1pte_no_fault( &current->mm, &gpte, &spte );
+
+
+    spl1e[(pa & ~PAGE_MASK) / sizeof(l1_pgentry_t) ] = mk_l1_pgentry( spte );
+
+}
+
+void shadow_l2_normal_pt_update( unsigned long pa, unsigned long gpte )
+{
+    unsigned long gpfn, spfn, spte;
+    l2_pgentry_t * sp2le;
+    unsigned long s_sh=0;
+
+    SH_VVLOG("shadow_l2_normal_pt_update pa=%08lx, gpte=%08lx",pa,gpte);
+
+    // to get here, we know the l2 page has a shadow
+
+    gpfn = pa >> PAGE_SHIFT;
+    spfn = __shadow_status(&current->mm, gpfn) & PSH_pfn_mask;
+
+
+    spte = 0;
+
+    if( gpte & _PAGE_PRESENT )
+        s_sh = __shadow_status(&current->mm, gpte >> PAGE_SHIFT);
+
+    sp2le = (l2_pgentry_t *) map_domain_mem( spfn << PAGE_SHIFT );
+    // no real need for a cache here
+
+    l2pde_general( &current->mm, &gpte, &spte, s_sh );
+
+    // XXXX Should mark guest pte as DIRTY and ACCESSED too!!!!!
+
+    sp2le[(pa & ~PAGE_MASK) / sizeof(l2_pgentry_t) ] = 
+        mk_l2_pgentry( spte );
+
+    unmap_domain_mem( (void *) sp2le );
+}
+
+
+#if SHADOW_DEBUG
+
+static int sh_l2_present;
+static int sh_l1_present;
+char * sh_check_name;
+
+#define FAIL(_f, _a...)                             \
+{printk("XXX %s-FAIL (%d,%d)" _f " g=%08lx s=%08lx\n",  sh_check_name, level, i, ## _a , gpte, spte ); BUG();}
+
+static int check_pte( struct mm_struct *m, 
+                      unsigned long gpte, unsigned long spte, int level, int i )
+{
+    unsigned long mask, gpfn, spfn;
+
+    if ( spte == 0 || spte == 0xdeadface || spte == 0x00000E00)
+        return 1;  // always safe
+
+    if ( !(spte & _PAGE_PRESENT) )
+        FAIL("Non zero not present spte");
+
+    if( level == 2 ) sh_l2_present++;
+    if( level == 1 ) sh_l1_present++;
+
+    if ( !(gpte & _PAGE_PRESENT) )
+        FAIL("Guest not present yet shadow is");
+
+    mask = ~(_PAGE_DIRTY|_PAGE_ACCESSED|_PAGE_RW|0xFFFFF000);
+
+    if ( (spte & mask) != (gpte & mask ) )
+        FAIL("Corrupt?");
+
+    if ( (spte & _PAGE_DIRTY ) && !(gpte & _PAGE_DIRTY) )
+        FAIL("Dirty coherence");
+
+    if ( (spte & _PAGE_ACCESSED ) && !(gpte & _PAGE_ACCESSED) )
+        FAIL("Accessed coherence");
+
+    if ( (spte & _PAGE_RW ) && !(gpte & _PAGE_RW) )
+        FAIL("RW coherence");
+
+    if ( (spte & _PAGE_RW ) && !((gpte & _PAGE_RW) && (gpte & _PAGE_DIRTY) ))
+        FAIL("RW2 coherence");
+    spfn = spte>>PAGE_SHIFT;
+    gpfn = gpte>>PAGE_SHIFT;
+
+    if ( gpfn == spfn )
+    {
+        if ( level > 1 )
+            FAIL("Linear map ???");    // XXX this will fail on BSD
+
+        return 1;
+    }
+    else
+    {
+        if ( level < 2 )
+            FAIL("Shadow in L1 entry?");
+
+        if ( __shadow_status(p, gpfn) != (PSH_shadowed | spfn) )
+            FAIL("spfn problem g.sf=%08lx", 
+                 __shadow_status(p, gpfn) );
+    }
+
+    return 1;
+}
+
+
+static int check_l1_table( struct mm_struct *m, unsigned long va, 
+                           unsigned long g2, unsigned long s2 )
+{
+    int j;
+    unsigned long *gpl1e, *spl1e;
+
+    //gpl1e = (unsigned long *) &(linear_pg_table[ va>>PAGE_SHIFT]);
+    //spl1e = (unsigned long *) &(shadow_linear_pg_table[ va>>PAGE_SHIFT]);
+
+    gpl1e = map_domain_mem( g2<<PAGE_SHIFT );
+    spl1e = map_domain_mem( s2<<PAGE_SHIFT );
+
+    for ( j = 0; j < ENTRIES_PER_L1_PAGETABLE; j++ )
+    {
+        unsigned long gpte = gpl1e[j];
+        unsigned long spte = spl1e[j];
+  
+        check_pte( p, gpte, spte, 1, j );
+    }
+    unmap_domain_mem( spl1e );
+    unmap_domain_mem( gpl1e );
+
+    return 1;
+}
+
+#define FAILPT(_f, _a...)                             \
+{printk("XXX FAIL %s-PT" _f "\n", s, ## _a ); BUG();}
+
+int check_pagetable( struct mm_struct *m, pagetable_t pt, char *s )
+{
+    unsigned long gptbase = pagetable_val(pt);
+    unsigned long gpfn, spfn;
+    int i;
+    l2_pgentry_t *gpl2e, *spl2e;
+
+    sh_check_name = s;
+
+    SH_VVLOG("%s-PT Audit",s);
+
+    sh_l2_present = sh_l1_present = 0;
+
+    gpfn =  gptbase >> PAGE_SHIFT;
+
+    if ( ! (__shadow_status(p, gpfn) & PSH_shadowed) )
+    {
+        printk("%s-PT %08lx not shadowed\n", s, gptbase);
+
+        if( __shadow_status(p, gpfn) != 0 ) BUG();
+
+        return 0;
+    }
+    spfn = __shadow_status(p, gpfn) & PSH_pfn_mask;
+
+    if ( ! __shadow_status(p, gpfn) == (PSH_shadowed | spfn) )
+        FAILPT("ptbase shadow inconsistent1");
+
+    gpl2e = (l2_pgentry_t *) map_domain_mem( gpfn << PAGE_SHIFT );
+    spl2e = (l2_pgentry_t *) map_domain_mem( spfn << PAGE_SHIFT );
+
+    //ipl2e = (l2_pgentry_t *) map_domain_mem( spfn << PAGE_SHIFT );
+
+
+    if ( memcmp( &spl2e[DOMAIN_ENTRIES_PER_L2_PAGETABLE],
+                 &gpl2e[DOMAIN_ENTRIES_PER_L2_PAGETABLE], 
+                 ((SH_LINEAR_PT_VIRT_START>>(L2_PAGETABLE_SHIFT))-DOMAIN_ENTRIES_PER_L2_PAGETABLE)
+                 * sizeof(l2_pgentry_t)) )
+    {
+        printk("gpfn=%08lx spfn=%08lx\n", gpfn, spfn);
+        for (i=DOMAIN_ENTRIES_PER_L2_PAGETABLE; 
+             i<(SH_LINEAR_PT_VIRT_START>>(L2_PAGETABLE_SHIFT));
+             i++ )
+            printk("+++ (%d) %08lx %08lx\n",i,
+                   l2_pgentry_val(gpl2e[i]), l2_pgentry_val(spl2e[i]) );
+        FAILPT("hypervisor entries inconsistent");
+    }
+
+    if ( (l2_pgentry_val(spl2e[LINEAR_PT_VIRT_START >> L2_PAGETABLE_SHIFT]) != 
+          l2_pgentry_val(gpl2e[LINEAR_PT_VIRT_START >> L2_PAGETABLE_SHIFT])) )
+        FAILPT("hypervisor linear map inconsistent");
+
+    if ( (l2_pgentry_val(spl2e[SH_LINEAR_PT_VIRT_START >> L2_PAGETABLE_SHIFT]) != 
+          ((spfn << PAGE_SHIFT) | __PAGE_HYPERVISOR)) )
+        FAILPT("hypervisor shadow linear map inconsistent %08lx %08lx",
+               l2_pgentry_val(spl2e[SH_LINEAR_PT_VIRT_START >> L2_PAGETABLE_SHIFT]),
+               (spfn << PAGE_SHIFT) | __PAGE_HYPERVISOR
+            );
+
+    if ( (l2_pgentry_val(spl2e[PERDOMAIN_VIRT_START >> L2_PAGETABLE_SHIFT]) !=
+          ((__pa(frame_table[gpfn].u.domain->mm.perdomain_pt) | __PAGE_HYPERVISOR))) )
+        FAILPT("hypervisor per-domain map inconsistent");
+
+
+    // check the whole L2
+    for ( i = 0; i < DOMAIN_ENTRIES_PER_L2_PAGETABLE; i++ )
+    {
+        unsigned long gpte = l2_pgentry_val(gpl2e[i]);
+        unsigned long spte = l2_pgentry_val(spl2e[i]);
+
+        check_pte( p, gpte, spte, 2, i );
+    }
+
+
+    // go back and recurse
+    for ( i = 0; i < DOMAIN_ENTRIES_PER_L2_PAGETABLE; i++ )
+    {
+        unsigned long gpte = l2_pgentry_val(gpl2e[i]);
+        unsigned long spte = l2_pgentry_val(spl2e[i]);
+
+        if ( spte )    
+            check_l1_table( p,
+                            i<<L2_PAGETABLE_SHIFT,
+                            gpte>>PAGE_SHIFT, spte>>PAGE_SHIFT );
+
+    }
+
+    unmap_domain_mem( spl2e );
+    unmap_domain_mem( gpl2e );
+
+    SH_VVLOG("PT verified : l2_present = %d, l1_present = %d\n",
+             sh_l2_present, sh_l1_present );
+    return 1;
+}
+
+
+#endif
diff --git a/xen/common/shadow.c b/xen/common/shadow.c
deleted file mode 100644 (file)
index dc08bd0..0000000
+++ /dev/null
@@ -1,1058 +0,0 @@
-/* -*-  Mode:C++; c-file-style:BSD; c-basic-offset:4; tab-width:4 -*- */
-
-#include <xen/config.h>
-#include <xen/types.h>
-#include <xen/mm.h>
-#include <xen/shadow.h>
-#include <asm/domain_page.h>
-#include <asm/page.h>
-#include <xen/event.h>
-#include <xen/trace.h>
-
-
-/********
-
-To use these shadow page tables, guests must not rely on the ACCESSED
-and DIRTY bits on L2 pte's being accurate -- they will typically all be set.
-
-I doubt this will break anything. (If guests want to use the va_update
-mechanism they've signed up for this anyhow...)
-
-There's a per-domain shadow table spin lock which works fine for SMP
-hosts. We don't have to worry about interrupts as no shadow operations
-happen in an interrupt context. It's probably not quite ready for SMP
-guest operation as we have to worry about synchonisation between gpte
-and spte updates. Its possible that this might only happen in a
-hypercall context, in which case we'll probably at have a per-domain
-hypercall lock anyhow (at least initially).
-
-********/
-
-
-/**
-
-FIXME:
-
-The shadow table flush command is dangerous on SMP systems as the
-guest may be using the L2 on one CPU while the other is trying to 
-blow the table away. 
-
-The current save restore code works around this by not calling FLUSH,
-but by calling CLEAN2 which leaves all L2s in tact (this is probably
-quicker anyhow).
-
-Even so, we have to be very careful. The flush code may need to cause
-a TLB flush on another CPU. It needs to do this while holding the
-shadow table lock. The trouble is, the guest may be in the shadow page
-fault handler spinning waiting to grab the shadow lock. It may have
-intterupts disabled, hence we can't use the normal flush_tlb_cpu
-mechanism.
-
-For the moment, we have a grim race whereby the spinlock in the shadow
-fault handler is actually a try lock, in a loop with a helper for the
-tlb flush code.
-
-A better soloution would be to take a new flush lock, then raise a
-per-domain soft irq on the other CPU.  The softirq will switch to
-init's PTs, then do an atomic inc of a variable to count himself in,
-then spin on a lock.  Having noticed that the other guy has counted
-in, flush the shadow table, then release him by dropping the lock. He
-will then reload cr3 from mm.page_table on the way out of the softirq.
-
-In domian-softirq context we know that the guy holds no locks and has
-interrupts enabled. Nothing can go wrong ;-)
-
-**/
-
-static inline void free_shadow_page( struct mm_struct *m, 
-                                     struct pfn_info *pfn_info )
-{
-    unsigned long flags;
-    unsigned long type = pfn_info->type_and_flags & PGT_type_mask;
-
-    m->shadow_page_count--;
-
-    if (type == PGT_l1_page_table)
-        perfc_decr(shadow_l1_pages);
-    else if (type == PGT_l2_page_table)
-        perfc_decr(shadow_l2_pages);
-    else printk("Free shadow weird page type pfn=%08x type=%08x\n",
-                frame_table-pfn_info, pfn_info->type_and_flags);
-    
-    pfn_info->type_and_flags = 0;
-
-    spin_lock_irqsave(&free_list_lock, flags);
-    list_add(&pfn_info->list, &free_list);
-    free_pfns++;
-    spin_unlock_irqrestore(&free_list_lock, flags);
-}
-
-static void __free_shadow_table( struct mm_struct *m )
-{
-    int j, free=0;
-    struct shadow_status *a,*next;
-    // the code assumes you're not using the page tables i.e.
-    // the domain is stopped and cr3 is something else!!
-
-    // walk the hash table and call free_shadow_page on all pages
-
-    shadow_audit(m,1);
-
-    for(j=0;j<shadow_ht_buckets;j++)
-    {
-        a = &m->shadow_ht[j];        
-        if (a->pfn)
-        {
-            free_shadow_page( m, 
-                              &frame_table[a->spfn_and_flags & PSH_pfn_mask] );
-            a->pfn = 0;
-            a->spfn_and_flags = 0;
-            free++;
-        }
-        next=a->next;
-        a->next=NULL;
-        a=next;
-        while(a)
-        { 
-            struct shadow_status *next = a->next;
-
-            free_shadow_page( m, 
-                              &frame_table[a->spfn_and_flags & PSH_pfn_mask] );
-            a->pfn = 0;
-            a->spfn_and_flags = 0;
-            free++;
-            a->next = m->shadow_ht_free;           
-            m->shadow_ht_free = a;
-            a=next;
-        }
-        shadow_audit(m,0);
-    }
-    SH_LOG("Free shadow table. Freed= %d",free);
-}
-
-
-#define TABLE_OP_ZERO_L2 1
-#define TABLE_OP_ZERO_L1 2
-#define TABLE_OP_FREE_L1 3
-
-static inline int shadow_page_op( struct mm_struct *m, unsigned int op, 
-                                                                 unsigned int gpfn,
-                                  struct pfn_info *spfn_info, int *work )
-{
-    unsigned int spfn = spfn_info-frame_table;
-       int restart = 0;
-
-    switch( op )
-    {
-       case TABLE_OP_ZERO_L2:
-       {
-               if ( (spfn_info->type_and_flags & PGT_type_mask) == 
-             PGT_l2_page_table )
-               {
-                       unsigned long * spl1e = map_domain_mem( spfn<<PAGE_SHIFT );
-#ifdef __i386__
-                       memset(spl1e, 0, DOMAIN_ENTRIES_PER_L2_PAGETABLE * sizeof(*spl1e));
-#endif
-                       unmap_domain_mem( spl1e );
-               }
-    }
-       break;
-       
-       case TABLE_OP_ZERO_L1:
-       {
-               if ( (spfn_info->type_and_flags & PGT_type_mask) == 
-             PGT_l1_page_table )
-               {
-                       unsigned long * spl1e = map_domain_mem( spfn<<PAGE_SHIFT );
-                       memset( spl1e, 0, ENTRIES_PER_L1_PAGETABLE * sizeof(*spl1e) );
-                       unmap_domain_mem( spl1e );
-               }
-    }
-       break;
-
-       case TABLE_OP_FREE_L1:
-       {
-               if ( (spfn_info->type_and_flags & PGT_type_mask) == 
-             PGT_l1_page_table )
-               {
-                       // lock is already held
-                       delete_shadow_status( m, gpfn );
-                       restart = 1; // we need to go to start of list again
-               }
-    }
-
-       break;
-       
-       default:
-               BUG();
-
-    }
-    return restart;
-}
-
-static void __scan_shadow_table( struct mm_struct *m, unsigned int op )
-{
-    int j, work=0;
-    struct shadow_status *a, *next;
-    // the code assumes you're not using the page tables i.e.
-    // the domain is stopped and cr3 is something else!!
-
-    // walk the hash table and call free_shadow_page on all pages
-
-    shadow_audit(m,1);
-
-    for(j=0;j<shadow_ht_buckets;j++)
-    {
-       retry:
-        a = &m->shadow_ht[j];     
-               next = a->next;
-        if (a->pfn)
-        {
-            if ( shadow_page_op( m, op, a->pfn,                                                                 
-                                                                &frame_table[a->spfn_and_flags & PSH_pfn_mask], 
-                                                                &work ) )
-                               goto retry;
-        }
-        a=next;
-        while(a)
-        { 
-                       next = a->next;
-            if ( shadow_page_op( m, op, a->pfn,
-                                                                &frame_table[a->spfn_and_flags & PSH_pfn_mask],
-                                                                &work ) )
-                               goto retry;
-            a=next;
-        }
-        shadow_audit(m,0);
-    }
-    SH_VLOG("Scan shadow table. Work=%d l1=%d l2=%d", work, perfc_value(shadow_l1_pages), perfc_value(shadow_l2_pages));
-}
-
-
-void shadow_mode_init(void)
-{
-}
-
-int shadow_mode_enable( struct domain *p, unsigned int mode )
-{
-    struct mm_struct *m = &p->mm;
-    struct shadow_status **fptr;
-    int i;
-
-    m->shadow_mode = mode;
-    // allocate hashtable
-    m->shadow_ht = kmalloc(shadow_ht_buckets * 
-                           sizeof(struct shadow_status));
-    if( m->shadow_ht == NULL )
-        goto nomem;
-
-    memset(m->shadow_ht, 0, shadow_ht_buckets * sizeof(struct shadow_status));
-
-    // allocate space for first lot of extra nodes
-    m->shadow_ht_extras = kmalloc(sizeof(void*) + 
-                                  (shadow_ht_extra_size * 
-                                   sizeof(struct shadow_status)));
-    if( m->shadow_ht_extras == NULL )
-        goto nomem;
-
-    memset( m->shadow_ht_extras, 0, sizeof(void*) + (shadow_ht_extra_size * 
-                                                     sizeof(struct shadow_status)) );
-
-    m->shadow_extras_count++;
-    // add extras to free list
-    fptr = &m->shadow_ht_free;
-    for ( i=0; i<shadow_ht_extra_size; i++ )
-    {
-        *fptr = &m->shadow_ht_extras[i];
-        fptr = &(m->shadow_ht_extras[i].next);
-    }
-    *fptr = NULL;
-    *((struct shadow_status ** ) 
-      &m->shadow_ht_extras[shadow_ht_extra_size]) = NULL;
-
-    if ( mode == SHM_logdirty )
-    {
-        m->shadow_dirty_bitmap_size = (p->max_pages+63)&(~63);
-        m->shadow_dirty_bitmap = 
-            kmalloc( m->shadow_dirty_bitmap_size/8);
-        if( m->shadow_dirty_bitmap == NULL )
-        {
-            m->shadow_dirty_bitmap_size = 0;
-            goto nomem;
-        }
-        memset(m->shadow_dirty_bitmap,0,m->shadow_dirty_bitmap_size/8);
-    }
-
-    // call shadow_mk_pagetable
-    __shadow_mk_pagetable( m );
-    return 0;
-
-nomem:
-    return -ENOMEM;
-}
-
-void shadow_mode_disable( struct domain *p )
-{
-    struct mm_struct *m = &p->mm;
-    struct shadow_status *next;
-
-    __free_shadow_table( m );
-    m->shadow_mode = 0;
-
-    SH_LOG("freed tables count=%d l1=%d l2=%d",
-           m->shadow_page_count, perfc_value(shadow_l1_pages), perfc_value(shadow_l2_pages));
-
-    next = m->shadow_ht_extras;
-    while( next )
-    {
-        struct shadow_status * this = next;
-        m->shadow_extras_count--;
-        next = *((struct shadow_status **)(&next[shadow_ht_extra_size]));
-        kfree( this );
-    }
-
-    SH_LOG("freed extras, now %d", m->shadow_extras_count);
-
-    if( m->shadow_dirty_bitmap  )
-    {
-        kfree( m->shadow_dirty_bitmap );
-        m->shadow_dirty_bitmap = 0;
-        m->shadow_dirty_bitmap_size = 0;
-    }
-
-    // free the hashtable itself
-    kfree( &m->shadow_ht[0] );
-}
-
-static int shadow_mode_table_op(struct domain *d, 
-                                                           dom0_shadow_control_t *sc)
-{
-    unsigned int op = sc->op;
-    struct mm_struct *m = &d->mm;
-    int rc = 0;
-
-    // since Dom0 did the hypercall, we should be running with it's page
-    // tables right now. Calling flush on yourself would be really
-    // stupid.
-
-    ASSERT(spin_is_locked(&d->mm.shadow_lock));
-
-    if ( m == &current->mm )
-    {
-        printk("Don't try and flush your own page tables!\n");
-        return -EINVAL;
-    }
-   
-    SH_VLOG("shadow mode table op %08lx %08lx count %d",pagetable_val( m->pagetable),pagetable_val(m->shadow_table), m->shadow_page_count);
-
-    shadow_audit(m,1);
-
-    switch(op)
-    {
-    case DOM0_SHADOW_CONTROL_OP_FLUSH:
-        // XXX THIS IS VERY DANGEROUS : MUST ENSURE THE PTs ARE NOT IN USE ON
-               // OTHER CPU -- fix when we get sched sync pause.
-        __free_shadow_table( m );  
-        break;
-   
-    case DOM0_SHADOW_CONTROL_OP_CLEAN:   // zero all-non hypervisor
-       {
-               __scan_shadow_table( m, TABLE_OP_ZERO_L2 );
-               __scan_shadow_table( m, TABLE_OP_ZERO_L1 );
-
-               goto send_bitmap;
-       }
-               
-
-    case DOM0_SHADOW_CONTROL_OP_CLEAN2:  // zero all L2, free L1s
-    {
-               int i,j,zero=1;
-               
-               __scan_shadow_table( m, TABLE_OP_ZERO_L2 );
-               __scan_shadow_table( m, TABLE_OP_FREE_L1 );
-               
-       send_bitmap:
-               sc->stats.fault_count       = d->mm.shadow_fault_count;
-               sc->stats.dirty_count       = d->mm.shadow_dirty_count;
-               sc->stats.dirty_net_count   = d->mm.shadow_dirty_net_count;
-               sc->stats.dirty_block_count = d->mm.shadow_dirty_block_count;
-
-               d->mm.shadow_fault_count       = 0;
-               d->mm.shadow_dirty_count       = 0;
-               d->mm.shadow_dirty_net_count   = 0;
-               d->mm.shadow_dirty_block_count = 0;
-       
-               sc->pages = d->tot_pages;
-
-               if( d->tot_pages > sc->pages || 
-                       !sc->dirty_bitmap || !d->mm.shadow_dirty_bitmap )
-               {
-                       rc = -EINVAL;
-                       goto out;
-               }
-
-       
-#define chunk (8*1024) // do this in 1KB chunks for L1 cache
-       
-               for(i=0;i<d->tot_pages;i+=chunk)
-               {
-                       int bytes = ((  ((d->tot_pages-i) > (chunk))?
-                                                       (chunk):(d->tot_pages-i) ) + 7) / 8;
-           
-                       copy_to_user( sc->dirty_bitmap + (i/(8*sizeof(unsigned long))),
-                                                 d->mm.shadow_dirty_bitmap +(i/(8*sizeof(unsigned long))),
-                                                 bytes );
-           
-                       for(j=0; zero && j<bytes/sizeof(unsigned long);j++)
-                       {
-                               if( d->mm.shadow_dirty_bitmap[j] != 0 )
-                                       zero = 0;
-                       }
-
-                       memset( d->mm.shadow_dirty_bitmap +(i/(8*sizeof(unsigned long))),
-                                       0, bytes);
-               }
-
-        /* Might as well stop the domain as an optimization. */
-               if ( zero )
-            domain_pause_by_systemcontroller(d);
-
-               break;
-    }
-
-    case DOM0_SHADOW_CONTROL_OP_PEEK:
-    {
-               int i;
-
-               sc->stats.fault_count       = d->mm.shadow_fault_count;
-               sc->stats.dirty_count       = d->mm.shadow_dirty_count;
-               sc->stats.dirty_net_count   = d->mm.shadow_dirty_net_count;
-               sc->stats.dirty_block_count = d->mm.shadow_dirty_block_count;
-       
-               if( d->tot_pages > sc->pages || 
-                       !sc->dirty_bitmap || !d->mm.shadow_dirty_bitmap )
-               {
-                       rc = -EINVAL;
-                       goto out;
-               }
-       
-               sc->pages = d->tot_pages;
-       
-#define chunk (8*1024) // do this in 1KB chunks for L1 cache
-       
-               for(i=0;i<d->tot_pages;i+=chunk)
-               {
-                       int bytes = ((  ((d->tot_pages-i) > (chunk))?
-                                                       (chunk):(d->tot_pages-i) ) + 7) / 8;
-           
-                       copy_to_user( sc->dirty_bitmap + (i/(8*sizeof(unsigned long))),
-                                                 d->mm.shadow_dirty_bitmap +(i/(8*sizeof(unsigned long))),
-                                                 bytes );          
-               }
-
-               break;
-    }
-
-       default:
-               BUG();
-
-    }
-
-
-out:
-
-    SH_VLOG("shadow mode table op : page count %d", m->shadow_page_count);
-
-    shadow_audit(m,1);
-
-    // call shadow_mk_pagetable
-    __shadow_mk_pagetable( m );
-
-    return rc;
-}
-
-int shadow_mode_control( struct domain *p, dom0_shadow_control_t *sc )
-{
-    unsigned int cmd = sc->op;
-    int rc = 0;
-
-    spin_lock(&p->mm.shadow_lock);
-
-    if ( p->mm.shadow_mode && cmd == DOM0_SHADOW_CONTROL_OP_OFF )
-    {
-        shadow_mode_disable(p);
-    }
-    else if ( cmd == DOM0_SHADOW_CONTROL_OP_ENABLE_TEST )
-    {
-        if(p->mm.shadow_mode) shadow_mode_disable(p);
-        shadow_mode_enable(p, SHM_test);
-    } 
-    else if ( cmd == DOM0_SHADOW_CONTROL_OP_ENABLE_LOGDIRTY )
-    {
-        if(p->mm.shadow_mode) shadow_mode_disable(p);
-        shadow_mode_enable(p, SHM_logdirty);
-    } 
-    else if ( p->mm.shadow_mode && cmd >= DOM0_SHADOW_CONTROL_OP_FLUSH && cmd<=DOM0_SHADOW_CONTROL_OP_CLEAN2 )
-    {
-        rc = shadow_mode_table_op(p, sc);
-    }
-    else
-    {
-        rc = -EINVAL;
-    }
-
-       flush_tlb_cpu(p->processor);
-   
-    spin_unlock(&p->mm.shadow_lock);
-
-    return rc;
-}
-
-
-
-static inline struct pfn_info *alloc_shadow_page( struct mm_struct *m )
-{
-    m->shadow_page_count++;
-
-    return alloc_domain_page( NULL );
-}
-
-
-void unshadow_table( unsigned long gpfn, unsigned int type )
-{
-    unsigned long spfn;
-
-    SH_VLOG("unshadow_table type=%08x gpfn=%08lx",
-            type,
-            gpfn );
-
-    perfc_incrc(unshadow_table_count);
-
-    // this function is the same for both l1 and l2 tables
-
-    // even in the SMP guest case, there won't be a race here as
-    // this CPU was the one that cmpxchg'ed the page to invalid
-
-    spfn = __shadow_status(&current->mm, gpfn) & PSH_pfn_mask;
-
-    delete_shadow_status(&current->mm, gpfn);
-
-    free_shadow_page( &current->mm, &frame_table[spfn] );
-
-}
-
-
-unsigned long shadow_l2_table( 
-    struct mm_struct *m, unsigned long gpfn )
-{
-    struct pfn_info *spfn_info;
-    unsigned long spfn;
-    l2_pgentry_t *spl2e, *gpl2e;
-    int i;
-
-    SH_VVLOG("shadow_l2_table( %08lx )",gpfn);
-
-    perfc_incrc(shadow_l2_table_count);
-
-    // XXX in future, worry about racing in SMP guests 
-    //      -- use cmpxchg with PSH_pending flag to show progress (and spin)
-
-    spfn_info = alloc_shadow_page(m);
-
-    ASSERT( spfn_info ); // XXX deal with failure later e.g. blow cache
-
-    spfn_info->type_and_flags = PGT_l2_page_table;
-    perfc_incr(shadow_l2_pages);
-
-    spfn = (unsigned long) (spfn_info - frame_table);
-
-    // mark pfn as being shadowed, update field to point at shadow
-    set_shadow_status(m, gpfn, spfn | PSH_shadowed);
-    // we need to do this before the linear map is set up
-    spl2e = (l2_pgentry_t *) map_domain_mem(spfn << PAGE_SHIFT);
-
-#ifdef __i386__
-    // get hypervisor and 2x linear PT mapings installed 
-    memcpy(&spl2e[DOMAIN_ENTRIES_PER_L2_PAGETABLE], 
-           &idle_pg_table[DOMAIN_ENTRIES_PER_L2_PAGETABLE],
-           HYPERVISOR_ENTRIES_PER_L2_PAGETABLE * sizeof(l2_pgentry_t));
-    spl2e[LINEAR_PT_VIRT_START >> L2_PAGETABLE_SHIFT] =
-        mk_l2_pgentry((gpfn << PAGE_SHIFT) | __PAGE_HYPERVISOR);
-    spl2e[SH_LINEAR_PT_VIRT_START >> L2_PAGETABLE_SHIFT] =
-        mk_l2_pgentry((spfn << PAGE_SHIFT) | __PAGE_HYPERVISOR);
-    spl2e[PERDOMAIN_VIRT_START >> L2_PAGETABLE_SHIFT] =
-        mk_l2_pgentry(__pa(frame_table[gpfn].u.domain->mm.perdomain_pt) | 
-                      __PAGE_HYPERVISOR);
-#endif
-
-    // can't use the linear map as we may not be in the right PT
-    gpl2e = (l2_pgentry_t *) map_domain_mem(gpfn << PAGE_SHIFT);
-
-    // proactively create entries for pages that are already shadowed
-    for ( i = 0; i < DOMAIN_ENTRIES_PER_L2_PAGETABLE; i++ )
-    {
-        unsigned long spte = 0;
-
-#if 0  // Turns out this doesn't really help
-        unsigned long gpte;
-
-        gpte = l2_pgentry_val(gpl2e[i]);
-
-        if (gpte & _PAGE_PRESENT)
-        {
-            unsigned long s_sh = 
-                __shadow_status(p, gpte>>PAGE_SHIFT);
-
-            l2pde_general( m, &gpte, &spte, s_sh );
-
-        }
-#endif
-
-        spl2e[i] = mk_l2_pgentry( spte );
-
-    }
-
-    // its arguable we should 'preemptively shadow' a few active L1 pages
-    // to avoid taking a string of faults when 'jacking' a running domain
-
-    unmap_domain_mem( gpl2e );
-    unmap_domain_mem( spl2e );
-
-    SH_VLOG("shadow_l2_table( %08lx -> %08lx)",gpfn,spfn);
-
-    return spfn;
-}
-
-
-int shadow_fault( unsigned long va, long error_code )
-{
-    unsigned long gpte, spte;
-    struct mm_struct *m = &current->mm;
-
-    SH_VVLOG("shadow_fault( va=%08lx, code=%ld )", va, error_code );
-
-    check_pagetable( current, current->mm.pagetable, "pre-sf" );
-
-    if ( unlikely(__get_user(gpte, (unsigned long*)&linear_pg_table[va>>PAGE_SHIFT])) )
-    {
-        SH_VVLOG("shadow_fault - EXIT: read gpte faulted" );
-        return 0;  // propagate to guest
-    }
-
-    if ( ! (gpte & _PAGE_PRESENT) )
-    {
-        SH_VVLOG("shadow_fault - EXIT: gpte not present (%lx)",gpte );
-        return 0;  // we're not going to be able to help
-    }
-
-    if ( (error_code & 2)  && ! (gpte & _PAGE_RW) )
-    {
-        // write fault on RO page
-        return 0;
-    }
-
-    // take the lock and reread gpte
-
-    while( unlikely(!spin_trylock(&current->mm.shadow_lock)) )
-       {
-               extern volatile unsigned long flush_cpumask;
-               if ( test_and_clear_bit(smp_processor_id(), &flush_cpumask) )
-                       local_flush_tlb();
-               rep_nop();
-       }
-       
-       ASSERT(spin_is_locked(&current->mm.shadow_lock));
-       
-    if ( unlikely(__get_user(gpte, (unsigned long*)&linear_pg_table[va>>PAGE_SHIFT])) )
-    {
-        SH_VVLOG("shadow_fault - EXIT: read gpte faulted" );
-        spin_unlock(&m->shadow_lock);
-        return 0;  // propagate to guest
-    }
-
-    if ( unlikely(!(gpte & _PAGE_PRESENT)) )
-    {
-        SH_VVLOG("shadow_fault - EXIT: gpte not present (%lx)",gpte );
-        spin_unlock(&m->shadow_lock);
-        return 0;  // we're not going to be able to help
-    }
-
-    if ( error_code & 2  )  
-    {  // write fault
-        if ( likely(gpte & _PAGE_RW) )
-        {
-            l1pte_write_fault( m, &gpte, &spte );
-        }
-        else
-        {   // write fault on RO page
-            SH_VVLOG("shadow_fault - EXIT: write fault on RO page (%lx)",gpte );
-            spin_unlock(&m->shadow_lock);
-            return 0; // propagate to guest
-            // not clear whether we should set accessed bit here...
-        }
-    }
-    else
-    {
-        l1pte_read_fault( m, &gpte, &spte );
-    }
-
-    SH_VVLOG("plan: gpte=%08lx  spte=%08lx", gpte, spte );
-
-    // write back updated gpte
-    // XXX watch out for read-only L2 entries! (not used in Linux)
-    if ( unlikely( __put_user( gpte, (unsigned long*)&linear_pg_table[va>>PAGE_SHIFT])) )
-        BUG();  // fixme!
-
-    if ( unlikely( __put_user( spte, (unsigned long*)&shadow_linear_pg_table[va>>PAGE_SHIFT])) )
-    { 
-        // failed:
-        //  the L1 may not be shadowed, or the L2 entry may be insufficient
-
-        unsigned long gpde, spde, gl1pfn, sl1pfn;
-
-        SH_VVLOG("3: not shadowed or l2 insufficient gpte=%08lx  spte=%08lx",gpte,spte );
-
-        gpde = l2_pgentry_val(linear_l2_table[va>>L2_PAGETABLE_SHIFT]);
-
-        gl1pfn = gpde>>PAGE_SHIFT;
-
-        
-        if ( ! (sl1pfn=__shadow_status(&current->mm, gl1pfn) ) )
-        {
-            // this L1 is NOT already shadowed so we need to shadow it
-            struct pfn_info *sl1pfn_info;
-            unsigned long *gpl1e, *spl1e;
-            int i;
-            sl1pfn_info = alloc_shadow_page( &current->mm ); 
-            sl1pfn_info->type_and_flags = PGT_l1_page_table;
-                       
-            sl1pfn = sl1pfn_info - frame_table;
-
-            SH_VVLOG("4a: l1 not shadowed ( %08lx )",sl1pfn);
-            perfc_incrc(shadow_l1_table_count);
-            perfc_incr(shadow_l1_pages);
-
-            set_shadow_status(&current->mm, gl1pfn, PSH_shadowed | sl1pfn);
-
-            l2pde_general( m, &gpde, &spde, sl1pfn );
-
-            linear_l2_table[va>>L2_PAGETABLE_SHIFT] = mk_l2_pgentry(gpde);
-            shadow_linear_l2_table[va>>L2_PAGETABLE_SHIFT] =  mk_l2_pgentry(spde);
-
-            gpl1e = (unsigned long *) &(linear_pg_table[
-                (va>>PAGE_SHIFT) & ~(ENTRIES_PER_L1_PAGETABLE-1) ]);
-
-            spl1e = (unsigned long *) &shadow_linear_pg_table[
-                (va>>PAGE_SHIFT) & ~(ENTRIES_PER_L1_PAGETABLE-1) ];
-
-
-            for ( i = 0; i < ENTRIES_PER_L1_PAGETABLE; i++ )
-            {
-                l1pte_no_fault( m, &gpl1e[i], &spl1e[i] );
-            }
-
-
-        }
-        else
-        {
-            // this L1 was shadowed (by another PT) but we didn't have an L2
-            // entry for it
-
-            SH_VVLOG("4b: was shadowed, l2 missing ( %08lx )",sl1pfn);
-
-            l2pde_general( m, &gpde, &spde, sl1pfn );
-
-            linear_l2_table[va>>L2_PAGETABLE_SHIFT] = mk_l2_pgentry(gpde);
-            shadow_linear_l2_table[va>>L2_PAGETABLE_SHIFT] = mk_l2_pgentry(spde);
-   
-        }              
-
-        shadow_linear_pg_table[va>>PAGE_SHIFT] = mk_l1_pgentry(spte);
-        // (we need to do the above even if we've just made the shadow L1)
-
-    } // end of fixup writing the shadow L1 directly failed
-     
-    perfc_incrc(shadow_fixup_count);
-
-       m->shadow_fault_count++;
-
-    check_pagetable( current, current->mm.pagetable, "post-sf" );
-
-    spin_unlock(&m->shadow_lock);
-
-    return 1; // let's try the faulting instruction again...
-
-}
-
-
-void shadow_l1_normal_pt_update( unsigned long pa, unsigned long gpte,
-                                 unsigned long *prev_spfn_ptr,
-                                 l1_pgentry_t **prev_spl1e_ptr )
-{
-    unsigned long gpfn, spfn, spte, prev_spfn = *prev_spfn_ptr;    
-    l1_pgentry_t * spl1e, * prev_spl1e = *prev_spl1e_ptr;
-
-
-    SH_VVLOG("shadow_l1_normal_pt_update pa=%08lx, gpte=%08lx, prev_spfn=%08lx, prev_spl1e=%p\n",
-             pa,gpte,prev_spfn, prev_spl1e);
-
-    // to get here, we know the l1 page *must* be shadowed
-
-    gpfn = pa >> PAGE_SHIFT;
-    spfn = __shadow_status(&current->mm, gpfn) & PSH_pfn_mask;
-
-    if ( spfn == prev_spfn )
-    {
-        spl1e = prev_spl1e;
-    }
-    else
-    {
-        if( prev_spl1e ) unmap_domain_mem( prev_spl1e );
-        spl1e = (l1_pgentry_t *) map_domain_mem( spfn << PAGE_SHIFT );
-        *prev_spfn_ptr  = spfn;
-        *prev_spl1e_ptr = spl1e;
-    }
-
-    // XXX we assume only pagetables can be shadowed; 
-    // this will have to change to allow arbitrary CoW etc.
-
-    l1pte_no_fault( &current->mm, &gpte, &spte );
-
-
-    spl1e[(pa & ~PAGE_MASK) / sizeof(l1_pgentry_t) ] = mk_l1_pgentry( spte );
-
-}
-
-void shadow_l2_normal_pt_update( unsigned long pa, unsigned long gpte )
-{
-    unsigned long gpfn, spfn, spte;
-    l2_pgentry_t * sp2le;
-    unsigned long s_sh=0;
-
-    SH_VVLOG("shadow_l2_normal_pt_update pa=%08lx, gpte=%08lx",pa,gpte);
-
-    // to get here, we know the l2 page has a shadow
-
-    gpfn = pa >> PAGE_SHIFT;
-    spfn = __shadow_status(&current->mm, gpfn) & PSH_pfn_mask;
-
-
-    spte = 0;
-
-    if( gpte & _PAGE_PRESENT )
-        s_sh = __shadow_status(&current->mm, gpte >> PAGE_SHIFT);
-
-    sp2le = (l2_pgentry_t *) map_domain_mem( spfn << PAGE_SHIFT );
-    // no real need for a cache here
-
-    l2pde_general( &current->mm, &gpte, &spte, s_sh );
-
-    // XXXX Should mark guest pte as DIRTY and ACCESSED too!!!!!
-
-    sp2le[(pa & ~PAGE_MASK) / sizeof(l2_pgentry_t) ] = 
-        mk_l2_pgentry( spte );
-
-    unmap_domain_mem( (void *) sp2le );
-}
-
-
-#if SHADOW_DEBUG
-
-static int sh_l2_present;
-static int sh_l1_present;
-char * sh_check_name;
-
-#define FAIL(_f, _a...)                             \
-{printk("XXX %s-FAIL (%d,%d)" _f " g=%08lx s=%08lx\n",  sh_check_name, level, i, ## _a , gpte, spte ); BUG();}
-
-static int check_pte( struct mm_struct *m, 
-                      unsigned long gpte, unsigned long spte, int level, int i )
-{
-    unsigned long mask, gpfn, spfn;
-
-    if ( spte == 0 || spte == 0xdeadface || spte == 0x00000E00)
-        return 1;  // always safe
-
-    if ( !(spte & _PAGE_PRESENT) )
-        FAIL("Non zero not present spte");
-
-    if( level == 2 ) sh_l2_present++;
-    if( level == 1 ) sh_l1_present++;
-
-    if ( !(gpte & _PAGE_PRESENT) )
-        FAIL("Guest not present yet shadow is");
-
-    mask = ~(_PAGE_DIRTY|_PAGE_ACCESSED|_PAGE_RW|0xFFFFF000);
-
-    if ( (spte & mask) != (gpte & mask ) )
-        FAIL("Corrupt?");
-
-    if ( (spte & _PAGE_DIRTY ) && !(gpte & _PAGE_DIRTY) )
-        FAIL("Dirty coherence");
-
-    if ( (spte & _PAGE_ACCESSED ) && !(gpte & _PAGE_ACCESSED) )
-        FAIL("Accessed coherence");
-
-    if ( (spte & _PAGE_RW ) && !(gpte & _PAGE_RW) )
-        FAIL("RW coherence");
-
-    if ( (spte & _PAGE_RW ) && !((gpte & _PAGE_RW) && (gpte & _PAGE_DIRTY) ))
-        FAIL("RW2 coherence");
-    spfn = spte>>PAGE_SHIFT;
-    gpfn = gpte>>PAGE_SHIFT;
-
-    if ( gpfn == spfn )
-    {
-        if ( level > 1 )
-            FAIL("Linear map ???");    // XXX this will fail on BSD
-
-        return 1;
-    }
-    else
-    {
-        if ( level < 2 )
-            FAIL("Shadow in L1 entry?");
-
-        if ( __shadow_status(p, gpfn) != (PSH_shadowed | spfn) )
-            FAIL("spfn problem g.sf=%08lx", 
-                 __shadow_status(p, gpfn) );
-    }
-
-    return 1;
-}
-
-
-static int check_l1_table( struct mm_struct *m, unsigned long va, 
-                           unsigned long g2, unsigned long s2 )
-{
-    int j;
-    unsigned long *gpl1e, *spl1e;
-
-    //gpl1e = (unsigned long *) &(linear_pg_table[ va>>PAGE_SHIFT]);
-    //spl1e = (unsigned long *) &(shadow_linear_pg_table[ va>>PAGE_SHIFT]);
-
-    gpl1e = map_domain_mem( g2<<PAGE_SHIFT );
-    spl1e = map_domain_mem( s2<<PAGE_SHIFT );
-
-    for ( j = 0; j < ENTRIES_PER_L1_PAGETABLE; j++ )
-    {
-        unsigned long gpte = gpl1e[j];
-        unsigned long spte = spl1e[j];
-  
-        check_pte( p, gpte, spte, 1, j );
-    }
-    unmap_domain_mem( spl1e );
-    unmap_domain_mem( gpl1e );
-
-    return 1;
-}
-
-#define FAILPT(_f, _a...)                             \
-{printk("XXX FAIL %s-PT" _f "\n", s, ## _a ); BUG();}
-
-int check_pagetable( struct mm_struct *m, pagetable_t pt, char *s )
-{
-    unsigned long gptbase = pagetable_val(pt);
-    unsigned long gpfn, spfn;
-    int i;
-    l2_pgentry_t *gpl2e, *spl2e;
-
-    sh_check_name = s;
-
-    SH_VVLOG("%s-PT Audit",s);
-
-    sh_l2_present = sh_l1_present = 0;
-
-    gpfn =  gptbase >> PAGE_SHIFT;
-
-    if ( ! (__shadow_status(p, gpfn) & PSH_shadowed) )
-    {
-        printk("%s-PT %08lx not shadowed\n", s, gptbase);
-
-        if( __shadow_status(p, gpfn) != 0 ) BUG();
-
-        return 0;
-    }
-    spfn = __shadow_status(p, gpfn) & PSH_pfn_mask;
-
-    if ( ! __shadow_status(p, gpfn) == (PSH_shadowed | spfn) )
-        FAILPT("ptbase shadow inconsistent1");
-
-    gpl2e = (l2_pgentry_t *) map_domain_mem( gpfn << PAGE_SHIFT );
-    spl2e = (l2_pgentry_t *) map_domain_mem( spfn << PAGE_SHIFT );
-
-    //ipl2e = (l2_pgentry_t *) map_domain_mem( spfn << PAGE_SHIFT );
-
-
-    if ( memcmp( &spl2e[DOMAIN_ENTRIES_PER_L2_PAGETABLE],
-                 &gpl2e[DOMAIN_ENTRIES_PER_L2_PAGETABLE], 
-                 ((SH_LINEAR_PT_VIRT_START>>(L2_PAGETABLE_SHIFT))-DOMAIN_ENTRIES_PER_L2_PAGETABLE)
-                 * sizeof(l2_pgentry_t)) )
-    {
-        printk("gpfn=%08lx spfn=%08lx\n", gpfn, spfn);
-        for (i=DOMAIN_ENTRIES_PER_L2_PAGETABLE; 
-             i<(SH_LINEAR_PT_VIRT_START>>(L2_PAGETABLE_SHIFT));
-             i++ )
-            printk("+++ (%d) %08lx %08lx\n",i,
-                   l2_pgentry_val(gpl2e[i]), l2_pgentry_val(spl2e[i]) );
-        FAILPT("hypervisor entries inconsistent");
-    }
-
-    if ( (l2_pgentry_val(spl2e[LINEAR_PT_VIRT_START >> L2_PAGETABLE_SHIFT]) != 
-          l2_pgentry_val(gpl2e[LINEAR_PT_VIRT_START >> L2_PAGETABLE_SHIFT])) )
-        FAILPT("hypervisor linear map inconsistent");
-
-    if ( (l2_pgentry_val(spl2e[SH_LINEAR_PT_VIRT_START >> L2_PAGETABLE_SHIFT]) != 
-          ((spfn << PAGE_SHIFT) | __PAGE_HYPERVISOR)) )
-        FAILPT("hypervisor shadow linear map inconsistent %08lx %08lx",
-               l2_pgentry_val(spl2e[SH_LINEAR_PT_VIRT_START >> L2_PAGETABLE_SHIFT]),
-               (spfn << PAGE_SHIFT) | __PAGE_HYPERVISOR
-            );
-
-    if ( (l2_pgentry_val(spl2e[PERDOMAIN_VIRT_START >> L2_PAGETABLE_SHIFT]) !=
-          ((__pa(frame_table[gpfn].u.domain->mm.perdomain_pt) | __PAGE_HYPERVISOR))) )
-        FAILPT("hypervisor per-domain map inconsistent");
-
-
-    // check the whole L2
-    for ( i = 0; i < DOMAIN_ENTRIES_PER_L2_PAGETABLE; i++ )
-    {
-        unsigned long gpte = l2_pgentry_val(gpl2e[i]);
-        unsigned long spte = l2_pgentry_val(spl2e[i]);
-
-        check_pte( p, gpte, spte, 2, i );
-    }
-
-
-    // go back and recurse
-    for ( i = 0; i < DOMAIN_ENTRIES_PER_L2_PAGETABLE; i++ )
-    {
-        unsigned long gpte = l2_pgentry_val(gpl2e[i]);
-        unsigned long spte = l2_pgentry_val(spl2e[i]);
-
-        if ( spte )    
-            check_l1_table( p,
-                            i<<L2_PAGETABLE_SHIFT,
-                            gpte>>PAGE_SHIFT, spte>>PAGE_SHIFT );
-
-    }
-
-    unmap_domain_mem( spl2e );
-    unmap_domain_mem( gpl2e );
-
-    SH_VVLOG("PT verified : l2_present = %d, l1_present = %d\n",
-             sh_l2_present, sh_l1_present );
-    return 1;
-}
-
-
-#endif
diff --git a/xen/include/asm-x86/shadow.h b/xen/include/asm-x86/shadow.h
new file mode 100644 (file)
index 0000000..df24329
--- /dev/null
@@ -0,0 +1,604 @@
+/* -*-  Mode:C; c-basic-offset:4; tab-width:4 -*- */
+
+#ifndef _XEN_SHADOW_H
+#define _XEN_SHADOW_H
+
+#include <xen/config.h>
+#include <xen/types.h>
+#include <xen/perfc.h>
+#include <asm/processor.h>
+
+
+/* Shadow PT flag bits in pfn_info */
+#define PSH_shadowed    (1<<31) /* page has a shadow. PFN points to shadow */
+#define PSH_pending     (1<<29) /* page is in the process of being shadowed */
+#define PSH_pfn_mask    ((1<<21)-1)
+
+/* Shadow PT operation mode : shadowmode variable in mm_struct */
+#define SHM_test        (1) /* just run domain on shadow PTs */
+#define SHM_logdirty    (2) /* log pages that are dirtied */
+#define SHM_translate   (3) /* lookup machine pages in translation table */
+//#define SHM_cow       (4) /* copy on write all dirtied pages */
+
+
+#define shadow_linear_pg_table ((l1_pgentry_t *)SH_LINEAR_PT_VIRT_START)
+#define shadow_linear_l2_table ((l2_pgentry_t *)(SH_LINEAR_PT_VIRT_START+(SH_LINEAR_PT_VIRT_START>>(L2_PAGETABLE_SHIFT-L1_PAGETABLE_SHIFT))))
+
+extern void shadow_mode_init(void);
+extern int shadow_mode_control( struct domain *p, dom0_shadow_control_t *sc );
+extern int shadow_fault( unsigned long va, long error_code );
+extern void shadow_l1_normal_pt_update( unsigned long pa, unsigned long gpte, 
+                                        unsigned long *prev_spfn_ptr,
+                                        l1_pgentry_t **prev_spl1e_ptr  );
+extern void shadow_l2_normal_pt_update( unsigned long pa, unsigned long gpte );
+extern void unshadow_table( unsigned long gpfn, unsigned int type );
+extern int shadow_mode_enable( struct domain *p, unsigned int mode );
+extern void shadow_mode_disable( struct domain *p );
+extern unsigned long shadow_l2_table( 
+    struct mm_struct *m, unsigned long gpfn );
+
+#define SHADOW_DEBUG 0
+#define SHADOW_HASH_DEBUG 0
+#define SHADOW_OPTIMISE 1
+
+struct shadow_status {
+    unsigned long pfn;            // gpfn 
+    unsigned long spfn_and_flags; // spfn plus flags
+    struct shadow_status *next;   // use pull-to-front list.
+};
+
+#define shadow_ht_extra_size         128 /*128*/
+#define shadow_ht_buckets            256 /*256*/
+
+#ifndef NDEBUG
+#define SH_LOG(_f, _a...)                             \
+printk("DOM%u: (file=shadow.c, line=%d) " _f "\n",    \
+       current->domain , __LINE__ , ## _a )
+#else
+#define SH_LOG(_f, _a...) 
+#endif
+
+#if SHADOW_DEBUG
+#define SH_VLOG(_f, _a...)                             \
+    printk("DOM%u: (file=shadow.c, line=%d) " _f "\n", \
+           current->domain , __LINE__ , ## _a )
+#else
+#define SH_VLOG(_f, _a...) 
+#endif
+
+#if 0
+#define SH_VVLOG(_f, _a...)                             \
+    printk("DOM%u: (file=shadow.c, line=%d) " _f "\n",  \
+           current->domain , __LINE__ , ## _a )
+#else
+#define SH_VVLOG(_f, _a...) 
+#endif
+
+
+/************************************************************************/
+
+#define shadow_mode(d)         (d->mm.shadow_mode)
+#define        shadow_lock_init(d)     spin_lock_init(&d->mm.shadow_lock)
+
+/************************************************************************/
+
+static inline int __mark_dirty( struct mm_struct *m, unsigned int mfn )
+{
+    unsigned int pfn;
+    int rc = 0;
+
+    ASSERT(spin_is_locked(&m->shadow_lock));
+
+    pfn = machine_to_phys_mapping[mfn];
+
+    /* We use values with the top bit set to mark MFNs that aren't
+       really part of the domain's pseudo-physical memory map e.g.
+       the shared info frame. Nothing to do here...
+    */
+    if ( unlikely(pfn & 0x80000000U) ) return rc; 
+
+    ASSERT(m->shadow_dirty_bitmap);
+    if( likely(pfn<m->shadow_dirty_bitmap_size) )
+    {
+       /* These updates occur with mm.shadow_lock held, so use 
+          (__) version of test_and_set */
+       if( __test_and_set_bit( pfn, m->shadow_dirty_bitmap ) == 0 )
+       {
+           // if we set it
+           m->shadow_dirty_count++;
+           rc = 1;
+       }
+    }
+    else
+    {
+        extern void show_traceX(void);
+        SH_LOG("mark_dirty OOR! mfn=%x pfn=%x max=%x (mm %p)",
+               mfn, pfn, m->shadow_dirty_bitmap_size, m );
+        SH_LOG("dom=%u caf=%08x taf=%08x\n", 
+               frame_table[mfn].u.domain->domain,
+               frame_table[mfn].count_and_flags, 
+               frame_table[mfn].type_and_flags );
+    }
+       
+    return rc;
+}
+
+
+static inline int mark_dirty( struct mm_struct *m, unsigned int mfn )
+{
+    int rc;
+    ASSERT(local_irq_is_enabled());
+    //if(spin_is_locked(&m->shadow_lock)) printk("+");
+    spin_lock(&m->shadow_lock);
+    rc = __mark_dirty( m, mfn );
+    spin_unlock(&m->shadow_lock);
+    return rc;
+}
+
+
+/************************************************************************/
+
+static inline void l1pte_write_fault( struct mm_struct *m, 
+                                      unsigned long *gpte_p, unsigned long *spte_p )
+{ 
+    unsigned long gpte = *gpte_p;
+    unsigned long spte = *spte_p;
+
+    switch( m->shadow_mode )
+    {
+    case SHM_test:
+        spte = gpte;
+        gpte |= _PAGE_DIRTY | _PAGE_ACCESSED;
+        spte |= _PAGE_RW | _PAGE_DIRTY | _PAGE_ACCESSED;
+        break;
+
+    case SHM_logdirty:
+        spte = gpte;
+        gpte |= _PAGE_DIRTY | _PAGE_ACCESSED;
+        spte |= _PAGE_RW | _PAGE_DIRTY | _PAGE_ACCESSED;
+        __mark_dirty( m, (gpte >> PAGE_SHIFT) );
+        break;
+    }
+
+    *gpte_p = gpte;
+    *spte_p = spte;
+}
+
+static inline void l1pte_read_fault( struct mm_struct *m, 
+                                     unsigned long *gpte_p, unsigned long *spte_p )
+{ 
+    unsigned long gpte = *gpte_p;
+    unsigned long spte = *spte_p;
+
+    switch( m->shadow_mode )
+    {
+    case SHM_test:
+        spte = gpte;
+        gpte |= _PAGE_ACCESSED;
+        spte |= _PAGE_ACCESSED;
+        if ( ! (gpte & _PAGE_DIRTY ) )
+            spte &= ~ _PAGE_RW;
+        break;
+
+    case SHM_logdirty:
+        spte = gpte;
+        gpte |= _PAGE_ACCESSED;
+        spte |= _PAGE_ACCESSED;
+        spte &= ~ _PAGE_RW;
+        break;
+    }
+
+    *gpte_p = gpte;
+    *spte_p = spte;
+}
+
+static inline void l1pte_no_fault( struct mm_struct *m, 
+                                   unsigned long *gpte_p, unsigned long *spte_p )
+{ 
+    unsigned long gpte = *gpte_p;
+    unsigned long spte = *spte_p;
+
+    switch( m->shadow_mode )
+    {
+    case SHM_test:
+        spte = 0;
+        if ( (gpte & (_PAGE_PRESENT|_PAGE_ACCESSED) ) == 
+             (_PAGE_PRESENT|_PAGE_ACCESSED) )
+        {
+            spte = gpte;
+            if ( ! (gpte & _PAGE_DIRTY ) )
+                spte &= ~ _PAGE_RW;
+        }
+        break;
+
+    case SHM_logdirty:
+        spte = 0;
+        if ( (gpte & (_PAGE_PRESENT|_PAGE_ACCESSED) ) == 
+             (_PAGE_PRESENT|_PAGE_ACCESSED) )
+        {
+            spte = gpte;
+            spte &= ~ _PAGE_RW;
+        }
+
+        break;
+    }
+
+    *gpte_p = gpte;
+    *spte_p = spte;
+}
+
+static inline void l2pde_general( struct mm_struct *m, 
+                                  unsigned long *gpde_p, unsigned long *spde_p,
+                                  unsigned long sl1pfn)
+{
+    unsigned long gpde = *gpde_p;
+    unsigned long spde = *spde_p;
+
+    spde = 0;
+
+    if ( sl1pfn )
+    {
+        spde = (gpde & ~PAGE_MASK) | (sl1pfn<<PAGE_SHIFT) | 
+            _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY;
+        gpde = gpde | _PAGE_ACCESSED | _PAGE_DIRTY;
+
+        if ( unlikely( (sl1pfn<<PAGE_SHIFT) == (gpde & PAGE_MASK)  ) )
+        {   
+            // detect linear map, and keep pointing at guest
+            SH_VLOG("4c: linear mapping ( %08lx )",sl1pfn);
+            spde = gpde & ~_PAGE_RW;
+        }
+    }
+
+    *gpde_p = gpde;
+    *spde_p = spde;
+}
+
+/*********************************************************************/
+
+
+
+#if SHADOW_HASH_DEBUG
+static void shadow_audit(struct mm_struct *m, int print)
+{
+    int live=0, free=0, j=0, abs;
+    struct shadow_status *a;
+
+    for( j = 0; j < shadow_ht_buckets; j++ )
+    {
+        a = &m->shadow_ht[j];        
+        if(a->pfn){live++; ASSERT(a->spfn_and_flags&PSH_pfn_mask);}
+        ASSERT((a->pfn&0xf0000000)==0);
+        ASSERT(a->pfn<0x00100000);
+        a=a->next;
+        while(a && live<9999)
+        { 
+            live++; 
+            if(a->pfn == 0 || a->spfn_and_flags == 0)
+            {
+                printk("XXX live=%d pfn=%08lx sp=%08lx next=%p\n",
+                       live, a->pfn, a->spfn_and_flags, a->next);
+                BUG();
+            }
+            ASSERT(a->pfn);
+            ASSERT((a->pfn&0xf0000000)==0);
+            ASSERT(a->pfn<0x00100000);
+            ASSERT(a->spfn_and_flags&PSH_pfn_mask);
+            a=a->next; 
+        }
+        ASSERT(live<9999);
+    }
+
+    a = m->shadow_ht_free;
+    while(a) { free++; a=a->next; }
+
+    if(print) printk("Xlive=%d free=%d\n",live,free);
+
+    abs=(perfc_value(shadow_l1_pages)+perfc_value(shadow_l2_pages))-live;
+    if( abs < -1 || abs > 1 )
+    {
+        printk("live=%d free=%d l1=%d l2=%d\n",live,free,
+               perfc_value(shadow_l1_pages), perfc_value(shadow_l2_pages) );
+        BUG();
+    }
+
+}
+
+#else
+#define shadow_audit(p, print)
+#endif
+
+
+
+static inline struct shadow_status* hash_bucket( struct mm_struct *m,
+                                                 unsigned int gpfn )
+{
+    return &(m->shadow_ht[gpfn % shadow_ht_buckets]);
+}
+
+
+static inline unsigned long __shadow_status( struct mm_struct *m,
+                                             unsigned int gpfn )
+{
+    struct shadow_status **ob, *b, *B = hash_bucket( m, gpfn );
+
+    b = B;
+    ob = NULL;
+
+    SH_VVLOG("lookup gpfn=%08x bucket=%p", gpfn, b );
+    shadow_audit(m,0);  // if in debug mode
+
+    do
+    {
+        if ( b->pfn == gpfn )
+        {
+            unsigned long t;
+            struct shadow_status *x;
+
+            // swap with head
+            t=B->pfn; B->pfn=b->pfn; b->pfn=t;
+            t=B->spfn_and_flags; B->spfn_and_flags=b->spfn_and_flags; 
+            b->spfn_and_flags=t;
+
+            if( ob )
+            {   // pull to front
+                *ob=b->next;
+                x=B->next;
+                B->next=b;
+                b->next=x;
+            }
+            return B->spfn_and_flags;
+        }
+#if SHADOW_HASH_DEBUG
+        else
+        {
+            if(b!=B)ASSERT(b->pfn);
+        }
+#endif
+        ob=&b->next;
+        b=b->next;
+    }
+    while (b);
+
+    return 0;
+}
+
+/* we can make this locking more fine grained e.g. per shadow page if it 
+ever becomes a problem, but since we need a spin lock on the hash table 
+anyway its probably not worth being too clever. */
+
+static inline unsigned long get_shadow_status( struct mm_struct *m,
+                                               unsigned int gpfn )
+{
+    unsigned long res;
+
+    /* If we get here, we know that this domain is running in shadow mode. 
+       We also know that some sort of update has happened to the underlying
+       page table page: either a PTE has been updated, or the page has
+       changed type. If we're in log dirty mode, we should set the appropriate
+       bit in the dirty bitmap.
+       NB: the VA update path doesn't use this so needs to be handled 
+       independently. 
+    */
+
+    ASSERT(local_irq_is_enabled());
+    //if(spin_is_locked(&m->shadow_lock)) printk("*");
+    spin_lock(&m->shadow_lock);
+
+    if( m->shadow_mode == SHM_logdirty )
+        __mark_dirty( m, gpfn );
+
+    res = __shadow_status( m, gpfn );
+    if (!res) spin_unlock(&m->shadow_lock);
+    return res;
+}
+
+
+static inline void put_shadow_status( struct mm_struct *m )
+{
+    spin_unlock(&m->shadow_lock);
+}
+
+
+static inline void delete_shadow_status( struct mm_struct *m,
+                                         unsigned int gpfn )
+{
+    struct shadow_status *b, *B, **ob;
+
+    ASSERT(spin_is_locked(&m->shadow_lock));
+
+    B = b = hash_bucket( m, gpfn );
+
+    SH_VVLOG("delete gpfn=%08x bucket=%p", gpfn, b );
+    shadow_audit(m,0);
+    ASSERT(gpfn);
+
+    if( b->pfn == gpfn )
+    {
+        if (b->next)
+        {
+            struct shadow_status *D=b->next;
+            b->spfn_and_flags = b->next->spfn_and_flags;
+            b->pfn = b->next->pfn;
+
+            b->next = b->next->next;
+            D->next = m->shadow_ht_free;
+            D->pfn = 0;
+            D->spfn_and_flags = 0;
+            m->shadow_ht_free = D;
+        }
+        else
+        {
+            b->pfn = 0;
+            b->spfn_and_flags = 0;
+        }
+
+#if SHADOW_HASH_DEBUG
+        if( __shadow_status(m,gpfn) ) BUG();  
+        shadow_audit(m,0);
+#endif
+        return;
+    }
+
+    ob = &b->next;
+    b=b->next;
+
+    do
+    {
+        if ( b->pfn == gpfn )
+        {
+            b->pfn = 0;
+            b->spfn_and_flags = 0;
+
+            // b is in the list
+            *ob=b->next;
+            b->next = m->shadow_ht_free;
+            m->shadow_ht_free = b;
+
+#if SHADOW_HASH_DEBUG
+            if( __shadow_status(m,gpfn) ) BUG();
+#endif
+            shadow_audit(m,0);
+            return;
+        }
+
+        ob = &b->next;
+        b=b->next;
+    }
+    while (b);
+
+    // if we got here, it wasn't in the list
+    BUG();
+}
+
+
+static inline void set_shadow_status( struct mm_struct *m,
+                                      unsigned int gpfn, unsigned long s )
+{
+    struct shadow_status *b, *B, *extra, **fptr;
+    int i;
+
+    ASSERT(spin_is_locked(&m->shadow_lock));
+
+    B = b = hash_bucket( m, gpfn );
+   
+    ASSERT(gpfn);
+    SH_VVLOG("set gpfn=%08x s=%08lx bucket=%p(%p)", gpfn, s, b, b->next );
+
+    shadow_audit(m,0);
+
+    do
+    {
+        if ( b->pfn == gpfn )
+        {
+            b->spfn_and_flags = s;
+            shadow_audit(m,0);
+            return;
+        }
+
+        b=b->next;
+    }
+    while (b);
+
+    // if we got here, this is an insert rather than update
+
+    ASSERT( s );  // deletes must have succeeded by here
+
+    if ( B->pfn == 0 )
+    {
+        // we can use this head
+        ASSERT( B->next == 0 );
+        B->pfn = gpfn;
+        B->spfn_and_flags = s;
+        shadow_audit(m,0);
+        return;
+    }
+
+    if( unlikely(m->shadow_ht_free == NULL) )
+    {
+        SH_LOG("allocate more shadow hashtable blocks");
+
+        // we need to allocate more space
+        extra = kmalloc(sizeof(void*) + (shadow_ht_extra_size * 
+                                         sizeof(struct shadow_status)));
+
+        if( ! extra ) BUG(); // should be more graceful here....
+
+        memset(extra, 0, sizeof(void*) + (shadow_ht_extra_size * 
+                                          sizeof(struct shadow_status)));
+
+        m->shadow_extras_count++;
+
+        // add extras to free list
+        fptr = &m->shadow_ht_free;
+        for ( i=0; i<shadow_ht_extra_size; i++ )
+        {
+            *fptr = &extra[i];
+            fptr = &(extra[i].next);
+        }
+        *fptr = NULL;
+
+        *((struct shadow_status ** ) &extra[shadow_ht_extra_size]) = 
+            m->shadow_ht_extras;
+        m->shadow_ht_extras = extra;
+
+    }
+
+    // should really put this in B to go right to front
+    b = m->shadow_ht_free;
+    m->shadow_ht_free = b->next;
+    b->spfn_and_flags = s;
+    b->pfn = gpfn;
+    b->next = B->next;
+    B->next = b;
+
+    shadow_audit(m,0);
+
+    return;
+}
+
+static inline void __shadow_mk_pagetable( struct mm_struct *mm )
+{
+    unsigned long gpfn, spfn=0;
+
+    gpfn =  pagetable_val(mm->pagetable) >> PAGE_SHIFT;
+
+    if ( unlikely((spfn=__shadow_status(mm, gpfn)) == 0 ) )
+    {
+        spfn = shadow_l2_table(mm, gpfn );
+    }      
+    mm->shadow_table = mk_pagetable(spfn<<PAGE_SHIFT);
+}
+
+static inline void shadow_mk_pagetable( struct mm_struct *mm )
+{
+    SH_VVLOG("shadow_mk_pagetable( gptbase=%08lx, mode=%d )",
+             pagetable_val(mm->pagetable), mm->shadow_mode );
+
+    if ( unlikely(mm->shadow_mode) )
+    {
+        ASSERT(local_irq_is_enabled());
+        spin_lock(&mm->shadow_lock);
+
+        __shadow_mk_pagetable( mm );
+
+        spin_unlock(&mm->shadow_lock);
+    }
+
+    SH_VVLOG("leaving shadow_mk_pagetable( gptbase=%08lx, mode=%d ) sh=%08lx",
+             pagetable_val(mm->pagetable), mm->shadow_mode, 
+             pagetable_val(mm->shadow_table) );
+
+}
+
+
+#if SHADOW_DEBUG
+extern int check_pagetable(struct mm_struct *m, pagetable_t pt, char *s);
+#else
+#define check_pagetable(m, pt, s) ((void)0)
+#endif
+
+
+#endif /* XEN_SHADOW_H */
+
+
index e4dbd1b0617a35824cbec6d37dbf119a45570717..c59e02e14c1519747941db15b89f76e62e5cd43b 100644 (file)
@@ -1,599 +1 @@
-/* -*-  Mode:C; c-basic-offset:4; tab-width:4 -*- */
-
-#ifndef _XEN_SHADOW_H
-#define _XEN_SHADOW_H
-
-#include <xen/config.h>
-#include <xen/types.h>
-#include <xen/perfc.h>
-#include <asm/processor.h>
-
-
-/* Shadow PT flag bits in pfn_info */
-#define PSH_shadowed    (1<<31) /* page has a shadow. PFN points to shadow */
-#define PSH_pending     (1<<29) /* page is in the process of being shadowed */
-#define PSH_pfn_mask    ((1<<21)-1)
-
-/* Shadow PT operation mode : shadowmode variable in mm_struct */
-#define SHM_test        (1) /* just run domain on shadow PTs */
-#define SHM_logdirty    (2) /* log pages that are dirtied */
-#define SHM_translate   (3) /* lookup machine pages in translation table */
-//#define SHM_cow       (4) /* copy on write all dirtied pages */
-
-
-#define shadow_linear_pg_table ((l1_pgentry_t *)SH_LINEAR_PT_VIRT_START)
-#define shadow_linear_l2_table ((l2_pgentry_t *)(SH_LINEAR_PT_VIRT_START+(SH_LINEAR_PT_VIRT_START>>(L2_PAGETABLE_SHIFT-L1_PAGETABLE_SHIFT))))
-
-extern void shadow_mode_init(void);
-extern int shadow_mode_control( struct domain *p, dom0_shadow_control_t *sc );
-extern int shadow_fault( unsigned long va, long error_code );
-extern void shadow_l1_normal_pt_update( unsigned long pa, unsigned long gpte, 
-                                        unsigned long *prev_spfn_ptr,
-                                        l1_pgentry_t **prev_spl1e_ptr  );
-extern void shadow_l2_normal_pt_update( unsigned long pa, unsigned long gpte );
-extern void unshadow_table( unsigned long gpfn, unsigned int type );
-extern int shadow_mode_enable( struct domain *p, unsigned int mode );
-extern void shadow_mode_disable( struct domain *p );
-extern unsigned long shadow_l2_table( 
-    struct mm_struct *m, unsigned long gpfn );
-
-#define SHADOW_DEBUG 0
-#define SHADOW_HASH_DEBUG 0
-#define SHADOW_OPTIMISE 1
-
-struct shadow_status {
-    unsigned long pfn;            // gpfn 
-    unsigned long spfn_and_flags; // spfn plus flags
-    struct shadow_status *next;   // use pull-to-front list.
-};
-
-#define shadow_ht_extra_size         128 /*128*/
-#define shadow_ht_buckets            256 /*256*/
-
-#ifndef NDEBUG
-#define SH_LOG(_f, _a...)                             \
-printk("DOM%u: (file=shadow.c, line=%d) " _f "\n",    \
-       current->domain , __LINE__ , ## _a )
-#else
-#define SH_LOG(_f, _a...) 
-#endif
-
-#if SHADOW_DEBUG
-#define SH_VLOG(_f, _a...)                             \
-    printk("DOM%u: (file=shadow.c, line=%d) " _f "\n", \
-           current->domain , __LINE__ , ## _a )
-#else
-#define SH_VLOG(_f, _a...) 
-#endif
-
-#if 0
-#define SH_VVLOG(_f, _a...)                             \
-    printk("DOM%u: (file=shadow.c, line=%d) " _f "\n",  \
-           current->domain , __LINE__ , ## _a )
-#else
-#define SH_VVLOG(_f, _a...) 
-#endif
-
-
-/************************************************************************/
-
-static inline int __mark_dirty( struct mm_struct *m, unsigned int mfn )
-{
-    unsigned int pfn;
-    int rc = 0;
-
-    ASSERT(spin_is_locked(&m->shadow_lock));
-
-    pfn = machine_to_phys_mapping[mfn];
-
-    /* We use values with the top bit set to mark MFNs that aren't
-       really part of the domain's psuedo-physical memory map e.g.
-       the shared info frame. Nothing to do here...
-    */
-    if ( unlikely(pfn & 0x80000000U) ) return rc; 
-
-    ASSERT(m->shadow_dirty_bitmap);
-    if( likely(pfn<m->shadow_dirty_bitmap_size) )
-    {
-       /* These updates occur with mm.shadow_lock held, so use 
-          (__) version of test_and_set */
-       if( __test_and_set_bit( pfn, m->shadow_dirty_bitmap ) == 0 )
-       {
-           // if we set it
-           m->shadow_dirty_count++;
-           rc = 1;
-       }
-    }
-    else
-    {
-        extern void show_traceX(void);
-        SH_LOG("mark_dirty OOR! mfn=%x pfn=%x max=%x (mm %p)",
-               mfn, pfn, m->shadow_dirty_bitmap_size, m );
-        SH_LOG("dom=%u caf=%08x taf=%08x\n", 
-               frame_table[mfn].u.domain->domain,
-               frame_table[mfn].count_and_flags, 
-               frame_table[mfn].type_and_flags );
-    }
-       
-    return rc;
-}
-
-
-static inline int mark_dirty( struct mm_struct *m, unsigned int mfn )
-{
-    int rc;
-    ASSERT(local_irq_is_enabled());
-    //if(spin_is_locked(&m->shadow_lock)) printk("+");
-    spin_lock(&m->shadow_lock);
-    rc = __mark_dirty( m, mfn );
-    spin_unlock(&m->shadow_lock);
-    return rc;
-}
-
-
-/************************************************************************/
-
-static inline void l1pte_write_fault( struct mm_struct *m, 
-                                      unsigned long *gpte_p, unsigned long *spte_p )
-{ 
-    unsigned long gpte = *gpte_p;
-    unsigned long spte = *spte_p;
-
-    switch( m->shadow_mode )
-    {
-    case SHM_test:
-        spte = gpte;
-        gpte |= _PAGE_DIRTY | _PAGE_ACCESSED;
-        spte |= _PAGE_RW | _PAGE_DIRTY | _PAGE_ACCESSED;
-        break;
-
-    case SHM_logdirty:
-        spte = gpte;
-        gpte |= _PAGE_DIRTY | _PAGE_ACCESSED;
-        spte |= _PAGE_RW | _PAGE_DIRTY | _PAGE_ACCESSED;
-        __mark_dirty( m, (gpte >> PAGE_SHIFT) );
-        break;
-    }
-
-    *gpte_p = gpte;
-    *spte_p = spte;
-}
-
-static inline void l1pte_read_fault( struct mm_struct *m, 
-                                     unsigned long *gpte_p, unsigned long *spte_p )
-{ 
-    unsigned long gpte = *gpte_p;
-    unsigned long spte = *spte_p;
-
-    switch( m->shadow_mode )
-    {
-    case SHM_test:
-        spte = gpte;
-        gpte |= _PAGE_ACCESSED;
-        spte |= _PAGE_ACCESSED;
-        if ( ! (gpte & _PAGE_DIRTY ) )
-            spte &= ~ _PAGE_RW;
-        break;
-
-    case SHM_logdirty:
-        spte = gpte;
-        gpte |= _PAGE_ACCESSED;
-        spte |= _PAGE_ACCESSED;
-        spte &= ~ _PAGE_RW;
-        break;
-    }
-
-    *gpte_p = gpte;
-    *spte_p = spte;
-}
-
-static inline void l1pte_no_fault( struct mm_struct *m, 
-                                   unsigned long *gpte_p, unsigned long *spte_p )
-{ 
-    unsigned long gpte = *gpte_p;
-    unsigned long spte = *spte_p;
-
-    switch( m->shadow_mode )
-    {
-    case SHM_test:
-        spte = 0;
-        if ( (gpte & (_PAGE_PRESENT|_PAGE_ACCESSED) ) == 
-             (_PAGE_PRESENT|_PAGE_ACCESSED) )
-        {
-            spte = gpte;
-            if ( ! (gpte & _PAGE_DIRTY ) )
-                spte &= ~ _PAGE_RW;
-        }
-        break;
-
-    case SHM_logdirty:
-        spte = 0;
-        if ( (gpte & (_PAGE_PRESENT|_PAGE_ACCESSED) ) == 
-             (_PAGE_PRESENT|_PAGE_ACCESSED) )
-        {
-            spte = gpte;
-            spte &= ~ _PAGE_RW;
-        }
-
-        break;
-    }
-
-    *gpte_p = gpte;
-    *spte_p = spte;
-}
-
-static inline void l2pde_general( struct mm_struct *m, 
-                                  unsigned long *gpde_p, unsigned long *spde_p,
-                                  unsigned long sl1pfn)
-{
-    unsigned long gpde = *gpde_p;
-    unsigned long spde = *spde_p;
-
-    spde = 0;
-
-    if ( sl1pfn )
-    {
-        spde = (gpde & ~PAGE_MASK) | (sl1pfn<<PAGE_SHIFT) | 
-            _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY;
-        gpde = gpde | _PAGE_ACCESSED | _PAGE_DIRTY;
-
-        if ( unlikely( (sl1pfn<<PAGE_SHIFT) == (gpde & PAGE_MASK)  ) )
-        {   
-            // detect linear map, and keep pointing at guest
-            SH_VLOG("4c: linear mapping ( %08lx )",sl1pfn);
-            spde = gpde & ~_PAGE_RW;
-        }
-    }
-
-    *gpde_p = gpde;
-    *spde_p = spde;
-}
-
-/*********************************************************************/
-
-
-
-#if SHADOW_HASH_DEBUG
-static void shadow_audit(struct mm_struct *m, int print)
-{
-    int live=0, free=0, j=0, abs;
-    struct shadow_status *a;
-
-    for( j = 0; j < shadow_ht_buckets; j++ )
-    {
-        a = &m->shadow_ht[j];        
-        if(a->pfn){live++; ASSERT(a->spfn_and_flags&PSH_pfn_mask);}
-        ASSERT((a->pfn&0xf0000000)==0);
-        ASSERT(a->pfn<0x00100000);
-        a=a->next;
-        while(a && live<9999)
-        { 
-            live++; 
-            if(a->pfn == 0 || a->spfn_and_flags == 0)
-            {
-                printk("XXX live=%d pfn=%08lx sp=%08lx next=%p\n",
-                       live, a->pfn, a->spfn_and_flags, a->next);
-                BUG();
-            }
-            ASSERT(a->pfn);
-            ASSERT((a->pfn&0xf0000000)==0);
-            ASSERT(a->pfn<0x00100000);
-            ASSERT(a->spfn_and_flags&PSH_pfn_mask);
-            a=a->next; 
-        }
-        ASSERT(live<9999);
-    }
-
-    a = m->shadow_ht_free;
-    while(a) { free++; a=a->next; }
-
-    if(print) printk("Xlive=%d free=%d\n",live,free);
-
-    abs=(perfc_value(shadow_l1_pages)+perfc_value(shadow_l2_pages))-live;
-    if( abs < -1 || abs > 1 )
-    {
-        printk("live=%d free=%d l1=%d l2=%d\n",live,free,
-               perfc_value(shadow_l1_pages), perfc_value(shadow_l2_pages) );
-        BUG();
-    }
-
-}
-
-#else
-#define shadow_audit(p, print)
-#endif
-
-
-
-static inline struct shadow_status* hash_bucket( struct mm_struct *m,
-                                                 unsigned int gpfn )
-{
-    return &(m->shadow_ht[gpfn % shadow_ht_buckets]);
-}
-
-
-static inline unsigned long __shadow_status( struct mm_struct *m,
-                                             unsigned int gpfn )
-{
-    struct shadow_status **ob, *b, *B = hash_bucket( m, gpfn );
-
-    b = B;
-    ob = NULL;
-
-    SH_VVLOG("lookup gpfn=%08x bucket=%p", gpfn, b );
-    shadow_audit(m,0);  // if in debug mode
-
-    do
-    {
-        if ( b->pfn == gpfn )
-        {
-            unsigned long t;
-            struct shadow_status *x;
-
-            // swap with head
-            t=B->pfn; B->pfn=b->pfn; b->pfn=t;
-            t=B->spfn_and_flags; B->spfn_and_flags=b->spfn_and_flags; 
-            b->spfn_and_flags=t;
-
-            if( ob )
-            {   // pull to front
-                *ob=b->next;
-                x=B->next;
-                B->next=b;
-                b->next=x;
-            }
-            return B->spfn_and_flags;
-        }
-#if SHADOW_HASH_DEBUG
-        else
-        {
-            if(b!=B)ASSERT(b->pfn);
-        }
-#endif
-        ob=&b->next;
-        b=b->next;
-    }
-    while (b);
-
-    return 0;
-}
-
-/* we can make this locking more fine grained e.g. per shadow page if it 
-ever becomes a problem, but since we need a spin lock on the hash table 
-anyway its probably not worth being too clever. */
-
-static inline unsigned long get_shadow_status( struct mm_struct *m,
-                                               unsigned int gpfn )
-{
-    unsigned long res;
-
-    /* If we get here, we know that this domain is running in shadow mode. 
-       We also know that some sort of update has happened to the underlying
-       page table page: either a PTE has been updated, or the page has
-       changed type. If we're in log dirty mode, we should set the approrpiate
-       bit in the dirty bitmap.
-       NB: the VA update path doesn't use this so needs to be handled 
-       independnetly. 
-    */
-
-    ASSERT(local_irq_is_enabled());
-    //if(spin_is_locked(&m->shadow_lock)) printk("*");
-    spin_lock(&m->shadow_lock);
-
-    if( m->shadow_mode == SHM_logdirty )
-        __mark_dirty( m, gpfn );
-
-    res = __shadow_status( m, gpfn );
-    if (!res) spin_unlock(&m->shadow_lock);
-    return res;
-}
-
-
-static inline void put_shadow_status( struct mm_struct *m )
-{
-    spin_unlock(&m->shadow_lock);
-}
-
-
-static inline void delete_shadow_status( struct mm_struct *m,
-                                         unsigned int gpfn )
-{
-    struct shadow_status *b, *B, **ob;
-
-    ASSERT(spin_is_locked(&m->shadow_lock));
-
-    B = b = hash_bucket( m, gpfn );
-
-    SH_VVLOG("delete gpfn=%08x bucket=%p", gpfn, b );
-    shadow_audit(m,0);
-    ASSERT(gpfn);
-
-    if( b->pfn == gpfn )
-    {
-        if (b->next)
-        {
-            struct shadow_status *D=b->next;
-            b->spfn_and_flags = b->next->spfn_and_flags;
-            b->pfn = b->next->pfn;
-
-            b->next = b->next->next;
-            D->next = m->shadow_ht_free;
-            D->pfn = 0;
-            D->spfn_and_flags = 0;
-            m->shadow_ht_free = D;
-        }
-        else
-        {
-            b->pfn = 0;
-            b->spfn_and_flags = 0;
-        }
-
-#if SHADOW_HASH_DEBUG
-        if( __shadow_status(m,gpfn) ) BUG();  
-        shadow_audit(m,0);
-#endif
-        return;
-    }
-
-    ob = &b->next;
-    b=b->next;
-
-    do
-    {
-        if ( b->pfn == gpfn )
-        {
-            b->pfn = 0;
-            b->spfn_and_flags = 0;
-
-            // b is in the list
-            *ob=b->next;
-            b->next = m->shadow_ht_free;
-            m->shadow_ht_free = b;
-
-#if SHADOW_HASH_DEBUG
-            if( __shadow_status(m,gpfn) ) BUG();
-#endif
-            shadow_audit(m,0);
-            return;
-        }
-
-        ob = &b->next;
-        b=b->next;
-    }
-    while (b);
-
-    // if we got here, it wasn't in the list
-    BUG();
-}
-
-
-static inline void set_shadow_status( struct mm_struct *m,
-                                      unsigned int gpfn, unsigned long s )
-{
-    struct shadow_status *b, *B, *extra, **fptr;
-    int i;
-
-    ASSERT(spin_is_locked(&m->shadow_lock));
-
-    B = b = hash_bucket( m, gpfn );
-   
-    ASSERT(gpfn);
-    SH_VVLOG("set gpfn=%08x s=%08lx bucket=%p(%p)", gpfn, s, b, b->next );
-
-    shadow_audit(m,0);
-
-    do
-    {
-        if ( b->pfn == gpfn )
-        {
-            b->spfn_and_flags = s;
-            shadow_audit(m,0);
-            return;
-        }
-
-        b=b->next;
-    }
-    while (b);
-
-    // if we got here, this is an insert rather than update
-
-    ASSERT( s );  // deletes must have succeeded by here
-
-    if ( B->pfn == 0 )
-    {
-        // we can use this head
-        ASSERT( B->next == 0 );
-        B->pfn = gpfn;
-        B->spfn_and_flags = s;
-        shadow_audit(m,0);
-        return;
-    }
-
-    if( unlikely(m->shadow_ht_free == NULL) )
-    {
-        SH_LOG("allocate more shadow hashtable blocks");
-
-        // we need to allocate more space
-        extra = kmalloc(sizeof(void*) + (shadow_ht_extra_size * 
-                                         sizeof(struct shadow_status)));
-
-        if( ! extra ) BUG(); // should be more graceful here....
-
-        memset(extra, 0, sizeof(void*) + (shadow_ht_extra_size * 
-                                          sizeof(struct shadow_status)));
-
-        m->shadow_extras_count++;
-
-        // add extras to free list
-        fptr = &m->shadow_ht_free;
-        for ( i=0; i<shadow_ht_extra_size; i++ )
-        {
-            *fptr = &extra[i];
-            fptr = &(extra[i].next);
-        }
-        *fptr = NULL;
-
-        *((struct shadow_status ** ) &extra[shadow_ht_extra_size]) = 
-            m->shadow_ht_extras;
-        m->shadow_ht_extras = extra;
-
-    }
-
-    // should really put this in B to go right to front
-    b = m->shadow_ht_free;
-    m->shadow_ht_free = b->next;
-    b->spfn_and_flags = s;
-    b->pfn = gpfn;
-    b->next = B->next;
-    B->next = b;
-
-    shadow_audit(m,0);
-
-    return;
-}
-
-static inline void __shadow_mk_pagetable( struct mm_struct *mm )
-{
-    unsigned long gpfn, spfn=0;
-
-    gpfn =  pagetable_val(mm->pagetable) >> PAGE_SHIFT;
-
-    if ( unlikely((spfn=__shadow_status(mm, gpfn)) == 0 ) )
-    {
-        spfn = shadow_l2_table(mm, gpfn );
-    }      
-    mm->shadow_table = mk_pagetable(spfn<<PAGE_SHIFT);
-}
-
-static inline void shadow_mk_pagetable( struct mm_struct *mm )
-{
-    SH_VVLOG("shadow_mk_pagetable( gptbase=%08lx, mode=%d )",
-             pagetable_val(mm->pagetable), mm->shadow_mode );
-
-    if ( unlikely(mm->shadow_mode) )
-    {
-        ASSERT(local_irq_is_enabled());
-        spin_lock(&mm->shadow_lock);
-
-        __shadow_mk_pagetable( mm );
-
-        spin_unlock(&mm->shadow_lock);
-    }
-
-    SH_VVLOG("leaving shadow_mk_pagetable( gptbase=%08lx, mode=%d ) sh=%08lx",
-             pagetable_val(mm->pagetable), mm->shadow_mode, 
-             pagetable_val(mm->shadow_table) );
-
-}
-
-
-#if SHADOW_DEBUG
-extern int check_pagetable(struct mm_struct *m, pagetable_t pt, char *s);
-#else
-#define check_pagetable(m, pt, s) ((void)0)
-#endif
-
-
-#endif /* XEN_SHADOW_H */
-
-
+#include <asm/shadow.h>